diff --git a/README.md b/README.md index 6b74ef151..1194d1eab 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,9 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh * `--leader-election-namespace `: The namespace where the leader election resource exists. Defaults to the pod namespace if not set. -* `--metrics-address`: The TCP network address where the prometheus metrics endpoint will run (example: `:8080` which corresponds to port 8080 on local host). The default is empty string, which means metrics endpoint is disabled. +* `--http-endpoint`: The TCP network address where the HTTP server for diagnostics, including metrics and leader election health check, will listen (example: `:8080` which corresponds to port 8080 on local host). The default is empty string, which means the server is disabled. + +* `--metrics-address`: (deprecated) The TCP network address where the prometheus metrics endpoint will run (example: `:8080` which corresponds to port 8080 on local host). The default is empty string, which means metrics endpoint is disabled. * `--metrics-path`: The HTTP path where prometheus metrics will be exposed. Default is `/metrics`. @@ -147,6 +149,13 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh * All glog / klog arguments are supported, such as `-v ` or `-alsologtostderr`. +#### HTTP endpoint + +The external-snapshotter optionally exposes an HTTP endpoint at address:port specified by `--http-endpoint` argument. When set, these two paths are exposed: + +* Metrics path, as set by `--metrics-path` argument (default is `/metrics`). + +* Leader election health check at `/healthz/leader-election`. It is recommended to run a liveness probe against this endpoint when leader election is used to kill external-provisioner leader that fails to connect to the API server to renew its leadership. See https://github.com/kubernetes-csi/csi-lib-utils/issues/66 for details. ## Upgrade from v1alpha1 to v1beta1 diff --git a/cmd/csi-snapshotter/main.go b/cmd/csi-snapshotter/main.go index 8c606da21..28e8aa2ae 100644 --- a/cmd/csi-snapshotter/main.go +++ b/cmd/csi-snapshotter/main.go @@ -20,6 +20,7 @@ import ( "context" "flag" "fmt" + "net/http" "os" "os/signal" "strings" @@ -66,7 +67,8 @@ var ( leaderElection = flag.Bool("leader-election", false, "Enables leader election.") leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.") - metricsAddress = flag.String("metrics-address", "", "The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled.") + metricsAddress = flag.String("metrics-address", "", "(deprecated) The TCP network address where the prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means metrics endpoint is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set.") + httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics and leader election health check, will listen (example: `:8080`). The default is empty string, which means the server is disabled. Only one of `--metrics-address` and `--http-endpoint` can be set.") metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") ) @@ -111,6 +113,15 @@ func main() { // Add Snapshot types to the default Kubernetes so events can be logged for them snapshotscheme.AddToScheme(scheme.Scheme) + if *metricsAddress != "" && *httpEndpoint != "" { + klog.Error("only one of `--metrics-address` and `--http-endpoint` can be set.") + os.Exit(1) + } + addr := *metricsAddress + if addr == "" { + addr = *httpEndpoint + } + // Connect to CSI. metricsManager := metrics.NewCSIMetricsManager("" /* driverName */) csiConn, err := connection.Connect( @@ -134,8 +145,20 @@ func main() { } klog.V(2).Infof("CSI driver name: %q", driverName) - metricsManager.SetDriverName(driverName) - metricsManager.StartMetricsEndpoint(*metricsAddress, *metricsPath) + + // Prepare http endpoint for metrics + leader election healthz + mux := http.NewServeMux() + if addr != "" { + metricsManager.RegisterToServer(mux, *metricsPath) + metricsManager.SetDriverName(driverName) + go func() { + klog.Infof("ServeMux listening at %q", addr) + err := http.ListenAndServe(*metricsAddress, mux) + if err != nil { + klog.Fatalf("Failed to start HTTP server at specified address (%q) and metrics path (%q): %s", addr, *metricsPath, err) + } + }() + } // Check it's ready if err = csirpc.ProbeForever(csiConn, *csiTimeout); err != nil { @@ -200,6 +223,10 @@ func main() { klog.Fatalf("failed to create leaderelection client: %v", err) } le := leaderelection.NewLeaderElection(leClientset, lockName, run) + if *httpEndpoint != "" { + le.PrepareHealthCheck(mux, leaderelection.DefaultHealthCheckTimeout) + } + if *leaderElectionNamespace != "" { le.WithNamespace(*leaderElectionNamespace) } diff --git a/cmd/snapshot-controller/main.go b/cmd/snapshot-controller/main.go index 8df295cb6..0b6e84f40 100644 --- a/cmd/snapshot-controller/main.go +++ b/cmd/snapshot-controller/main.go @@ -22,6 +22,7 @@ import ( "fmt" "os" "os/signal" + "sync" "time" "k8s.io/client-go/kubernetes" @@ -32,6 +33,7 @@ import ( "github.com/kubernetes-csi/csi-lib-utils/leaderelection" controller "github.com/kubernetes-csi/external-snapshotter/v3/pkg/common-controller" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned" snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme" @@ -48,6 +50,9 @@ var ( leaderElection = flag.Bool("leader-election", false, "Enables leader election.") leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.") + + httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics, will listen (example: :8080). The default is empty string, which means the server is disabled.") + metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.") ) var ( @@ -87,6 +92,28 @@ func main() { factory := informers.NewSharedInformerFactory(snapClient, *resyncPeriod) coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, *resyncPeriod) + // Create and register metrics manager + metricsManager := metrics.NewMetricsManager() + wg := &sync.WaitGroup{} + wg.Add(1) + if *httpEndpoint != "" { + srv, err := metricsManager.StartMetricsEndpoint(*metricsPath, *httpEndpoint, promklog{}, wg) + if err != nil { + klog.Errorf("Failed to start metrics server: %s", err.Error()) + os.Exit(1) + } + defer func() { + err := srv.Shutdown(context.Background()) + if err != nil { + klog.Errorf("Failed to shutdown metrics server: %s", err.Error()) + } + + klog.Infof("Metrics server successfully shutdown") + wg.Done() + }() + klog.Infof("Metrics server successfully started on %s, %s", *httpEndpoint, *metricsPath) + } + // Add Snapshot types to the default Kubernetes so events can be logged for them snapshotscheme.AddToScheme(scheme.Scheme) @@ -99,6 +126,7 @@ func main() { factory.Snapshot().V1beta1().VolumeSnapshotContents(), factory.Snapshot().V1beta1().VolumeSnapshotClasses(), coreFactory.Core().V1().PersistentVolumeClaims(), + metricsManager, *resyncPeriod, ) @@ -142,3 +170,9 @@ func buildConfig(kubeconfig string) (*rest.Config, error) { } return rest.InClusterConfig() } + +type promklog struct{} + +func (pl promklog) Println(v ...interface{}) { + klog.Error(v...) +} diff --git a/go.mod b/go.mod index 09de7e58a..6a17d0dfb 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/golang/protobuf v1.4.2 github.com/google/gofuzz v1.1.0 github.com/imdario/mergo v0.3.9 // indirect - github.com/kubernetes-csi/csi-lib-utils v0.8.1 + github.com/kubernetes-csi/csi-lib-utils v0.9.0 github.com/kubernetes-csi/csi-test v2.0.0+incompatible github.com/kubernetes-csi/external-snapshotter/client/v3 v3.0.0 github.com/prometheus/client_golang v1.7.1 diff --git a/go.sum b/go.sum index 51c56332e..5134859d0 100644 --- a/go.sum +++ b/go.sum @@ -38,7 +38,6 @@ github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb0 github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/purell v1.1.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= -github.com/PuerkitoBio/purell v1.1.1 h1:WEQqlqaGbrPkxLJWfBwQmfEAE1Z7ONdDLqrN38tNFfI= github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= @@ -329,8 +328,8 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/pty v1.1.5/go.mod h1:9r2w37qlBe7rQ6e1fg1S/9xpWHSnaqNdHD3WcMdbPDA= github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/kubernetes-csi/csi-lib-utils v0.8.1 h1:DHFs4MgzjSGF/FH95TEdLVa7R1CCi9UJ76jTUPO8iF0= -github.com/kubernetes-csi/csi-lib-utils v0.8.1/go.mod h1:FZflf0cCYlCquPQxVHa6Tyy0i/so6VAZTiEVK1do7CU= +github.com/kubernetes-csi/csi-lib-utils v0.9.0 h1:TbuDmxoVqM+fvVkzG/7sShyX/8jUln0ElLHuETcsQJI= +github.com/kubernetes-csi/csi-lib-utils v0.9.0/go.mod h1:8E2jVUX9j3QgspwHXa6LwyN7IHQDjW9jX3kwoWnSC+M= github.com/kubernetes-csi/csi-test v2.0.0+incompatible h1:ia04uVFUM/J9n/v3LEMn3rEG6FmKV5BH9QLw7H68h44= github.com/kubernetes-csi/csi-test v2.0.0+incompatible/go.mod h1:YxJ4UiuPWIhMBkxUKY5c267DyA0uDZ/MtAimhx/2TA0= github.com/kylelemons/godebug v0.0.0-20170820004349-d65d576e9348/go.mod h1:B69LEHPfb2qLo0BaaOLcbitczOKLWTsrBG9LczfCD4k= @@ -479,7 +478,6 @@ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= diff --git a/pkg/common-controller/framework_test.go b/pkg/common-controller/framework_test.go index bc481c91f..326329ae9 100644 --- a/pkg/common-controller/framework_test.go +++ b/pkg/common-controller/framework_test.go @@ -34,6 +34,7 @@ import ( snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme" informers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions" storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" "github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils" v1 "k8s.io/api/core/v1" storagev1 "k8s.io/api/storage/v1" @@ -750,6 +751,10 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte } coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, utils.NoResyncPeriodFunc()) + metricsManager := metrics.NewMetricsManager() + wg := &sync.WaitGroup{} + wg.Add(1) + metricsManager.StartMetricsEndpoint("/metrics", "localhost:0", nil, wg) ctrl := NewCSISnapshotCommonController( clientset, @@ -758,6 +763,7 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte informerFactory.Snapshot().V1beta1().VolumeSnapshotContents(), informerFactory.Snapshot().V1beta1().VolumeSnapshotClasses(), coreFactory.Core().V1().PersistentVolumeClaims(), + metricsManager, 60*time.Second, ) diff --git a/pkg/common-controller/snapshot_controller.go b/pkg/common-controller/snapshot_controller.go index 6ebbbd759..ebe8c5043 100644 --- a/pkg/common-controller/snapshot_controller.go +++ b/pkg/common-controller/snapshot_controller.go @@ -33,6 +33,7 @@ import ( klog "k8s.io/klog/v2" crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v3/apis/volumesnapshot/v1beta1" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" "github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils" ) @@ -236,6 +237,20 @@ func (ctrl *csiSnapshotCommonController) syncSnapshot(snapshot *crdv1.VolumeSnap // 2. Call checkandRemoveSnapshotFinalizersAndCheckandDeleteContent() with information obtained from step 1. This function name is very long but the name suggests what it does. It determines whether to remove finalizers on snapshot and whether to delete content. func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(snapshot *crdv1.VolumeSnapshot) error { klog.V(5).Infof("processSnapshotWithDeletionTimestamp VolumeSnapshot[%s]: %s", utils.SnapshotKey(snapshot), utils.GetSnapshotStatusForLogging(snapshot)) + driverName, err := ctrl.getSnapshotDriverName(snapshot) + if err != nil { + klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %v", utils.SnapshotKey(snapshot), err) + } + + snapshotProvisionType := metrics.DynamicSnapshotType + if snapshot.Spec.Source.VolumeSnapshotContentName != nil { + snapshotProvisionType = metrics.PreProvisionedSnapshotType + } + + // Processing delete, start operation metric + deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID) + deleteOperationValue := metrics.NewOperationValue(driverName, snapshotProvisionType) + ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue) var contentName string if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil { @@ -270,6 +285,7 @@ func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(sn } klog.V(5).Infof("processSnapshotWithDeletionTimestamp[%s]: delete snapshot content and remove finalizer from snapshot if needed", utils.SnapshotKey(snapshot)) + return ctrl.checkandRemoveSnapshotFinalizersAndCheckandDeleteContent(snapshot, content, deleteContent) } @@ -389,6 +405,7 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum // snapshot is bound but content is not pointing to the snapshot return ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotMisbound", "VolumeSnapshotContent is not bound to the VolumeSnapshot correctly") } + // everything is verified, return return nil } @@ -397,6 +414,28 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.VolumeSnapshot) error { uniqueSnapshotName := utils.SnapshotKey(snapshot) klog.V(5).Infof("syncUnreadySnapshot %s", uniqueSnapshotName) + driverName, err := ctrl.getSnapshotDriverName(snapshot) + if err != nil { + klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err) + } + + snapshotProvisionType := metrics.DynamicSnapshotType + if snapshot.Spec.Source.VolumeSnapshotContentName != nil { + snapshotProvisionType = metrics.PreProvisionedSnapshotType + } + + // Start metrics operations + if !utils.IsSnapshotCreated(snapshot) { + // Only start CreateSnapshot operation if the snapshot has not been cut + ctrl.metricsManager.OperationStart( + metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID), + metrics.NewOperationValue(driverName, snapshotProvisionType), + ) + } + ctrl.metricsManager.OperationStart( + metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID), + metrics.NewOperationValue(driverName, snapshotProvisionType), + ) // Pre-provisioned snapshot if snapshot.Spec.Source.VolumeSnapshotContentName != nil { @@ -404,13 +443,16 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol if err != nil { return err } + // if no content found yet, update status and return if content == nil { // can not find the desired VolumeSnapshotContent from cache store ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotContentMissing", "VolumeSnapshotContent is missing") klog.V(4).Infof("syncUnreadySnapshot[%s]: snapshot content %q requested but not found, will try again", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName) + return fmt.Errorf("snapshot %s requests an non-existing content %s", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName) } + // Set VolumeSnapshotRef UID newContent, err := ctrl.checkandBindSnapshotContent(snapshot, content) if err != nil { @@ -427,8 +469,10 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err)) return err } + return nil } + // snapshot.Spec.Source.VolumeSnapshotContentName == nil - dynamically creating snapshot klog.V(5).Infof("getDynamicallyProvisionedContentFromStore for snapshot %s", uniqueSnapshotName) contentObj, err := ctrl.getDynamicallyProvisionedContentFromStore(snapshot) @@ -1095,10 +1139,30 @@ func (ctrl *csiSnapshotCommonController) updateSnapshotStatus(snapshot *crdv1.Vo if updated { snapshotClone := snapshotObj.DeepCopy() snapshotClone.Status = newStatus + + // We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call. + // Must meet the following criteria to emit a successful CreateSnapshot status + // 1. Previous status was nil OR Previous status had a nil CreationTime + // 2. New status must be non-nil with a non-nil CreationTime + driverName := content.Spec.Driver + createOperationKey := metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID) + if !utils.IsSnapshotCreated(snapshotObj) && utils.IsSnapshotCreated(snapshotClone) { + ctrl.metricsManager.RecordMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName) + } + + // Must meet the following criteria to emit a successful CreateSnapshotAndReady status + // 1. Previous status was nil OR Previous status had a nil ReadyToUse OR Previous status had a false ReadyToUse + // 2. New status must be non-nil with a ReadyToUse as true + if !utils.IsSnapshotReady(snapshotObj) && utils.IsSnapshotReady(snapshotClone) { + createAndReadyOperation := metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID) + ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName) + } + newSnapshotObj, err := ctrl.clientset.SnapshotV1beta1().VolumeSnapshots(snapshotClone.Namespace).UpdateStatus(context.TODO(), snapshotClone, metav1.UpdateOptions{}) if err != nil { return nil, newControllerUpdateError(utils.SnapshotKey(snapshot), err.Error()) } + return newSnapshotObj, nil } @@ -1186,8 +1250,49 @@ func (ctrl *csiSnapshotCommonController) getSnapshotClass(className string) (*cr return class, nil } -// SetDefaultSnapshotClass is a helper function to figure out the default snapshot class from -// PVC/PV StorageClass and update VolumeSnapshot with this snapshot class name. +// getSnapshotDriverName is a helper function to get snapshot driver from the VolumeSnapshot. +// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName. +func (ctrl *csiSnapshotCommonController) getSnapshotDriverName(vs *crdv1.VolumeSnapshot) (string, error) { + klog.V(5).Infof("getSnapshotDriverName: VolumeSnapshot[%s]", vs.Name) + var driverName string + + // Pre-Provisioned snapshots have contentName as source + var contentName string + if vs.Spec.Source.VolumeSnapshotContentName != nil { + contentName = *vs.Spec.Source.VolumeSnapshotContentName + } + + // Get Driver name from SnapshotContent if we found a contentName + if contentName != "" { + content, err := ctrl.contentLister.Get(contentName) + if err != nil { + klog.Errorf("getSnapshotDriverName: failed to get snapshotContent: %v", contentName) + } else { + driverName = content.Spec.Driver + } + + if driverName != "" { + return driverName, nil + } + } + + // Dynamic snapshots will have a snapshotclass with a driver + if vs.Spec.VolumeSnapshotClassName != nil { + class, err := ctrl.getSnapshotClass(*vs.Spec.VolumeSnapshotClassName) + if err != nil { + klog.Errorf("getSnapshotDriverName: failed to get snapshotClass: %v", *vs.Spec.VolumeSnapshotClassName) + } else { + driverName = class.Driver + } + } + + return driverName, nil +} + +// SetDefaultSnapshotClass is a helper function to figure out the default snapshot class. +// For pre-provisioned case, it's an no-op. +// For dynamic provisioning, it gets the default SnapshotClasses in the system if there is any(could be multiple), +// and finds the one with the same CSI Driver as the PV from which a snapshot will be taken. func (ctrl *csiSnapshotCommonController) SetDefaultSnapshotClass(snapshot *crdv1.VolumeSnapshot) (*crdv1.VolumeSnapshotClass, *crdv1.VolumeSnapshot, error) { klog.V(5).Infof("SetDefaultSnapshotClass for snapshot [%s]", snapshot.Name) diff --git a/pkg/common-controller/snapshot_controller_base.go b/pkg/common-controller/snapshot_controller_base.go index 2046b24c8..dbd8f9d08 100644 --- a/pkg/common-controller/snapshot_controller_base.go +++ b/pkg/common-controller/snapshot_controller_base.go @@ -24,6 +24,7 @@ import ( clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned" storageinformers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions/volumesnapshot/v1beta1" storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1" + "github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics" "github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils" v1 "k8s.io/api/core/v1" @@ -60,6 +61,8 @@ type csiSnapshotCommonController struct { snapshotStore cache.Store contentStore cache.Store + metricsManager metrics.MetricsManager + resyncPeriod time.Duration } @@ -71,6 +74,7 @@ func NewCSISnapshotCommonController( volumeSnapshotContentInformer storageinformers.VolumeSnapshotContentInformer, volumeSnapshotClassInformer storageinformers.VolumeSnapshotClassInformer, pvcInformer coreinformers.PersistentVolumeClaimInformer, + metricsManager metrics.MetricsManager, resyncPeriod time.Duration, ) *csiSnapshotCommonController { broadcaster := record.NewBroadcaster() @@ -80,14 +84,15 @@ func NewCSISnapshotCommonController( eventRecorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: fmt.Sprintf("snapshot-controller")}) ctrl := &csiSnapshotCommonController{ - clientset: clientset, - client: client, - eventRecorder: eventRecorder, - resyncPeriod: resyncPeriod, - snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), - contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), - snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"), - contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"), + clientset: clientset, + client: client, + eventRecorder: eventRecorder, + resyncPeriod: resyncPeriod, + snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), + contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc), + snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"), + contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"), + metricsManager: metricsManager, } ctrl.pvcLister = pvcInformer.Lister() @@ -363,6 +368,7 @@ func (ctrl *csiSnapshotCommonController) updateSnapshot(snapshot *crdv1.VolumeSn if !newSnapshot { return nil } + err = ctrl.syncSnapshot(snapshot) if err != nil { if errors.IsConflict(err) { @@ -407,6 +413,13 @@ func (ctrl *csiSnapshotCommonController) updateContent(content *crdv1.VolumeSnap func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSnapshot) { _ = ctrl.snapshotStore.Delete(snapshot) klog.V(4).Infof("snapshot %q deleted", utils.SnapshotKey(snapshot)) + driverName, err := ctrl.getSnapshotDriverName(snapshot) + if err != nil { + klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err) + } else { + deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID) + ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName) + } snapshotContentName := "" if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil { @@ -416,6 +429,7 @@ func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSn klog.V(5).Infof("deleteSnapshot[%q]: content not bound", utils.SnapshotKey(snapshot)) return } + // sync the content when its snapshot is deleted. Explicitly sync'ing the // content here in response to snapshot deletion prevents the content from // waiting until the next sync period for its Release. diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index c283caf5b..b1f11ed01 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -30,13 +30,48 @@ import ( ) const ( - opStatusUnknown = "Unknown" - labelDriverName = "driver_name" - labelOperationName = "operation_name" - labelOperationStatus = "operation_status" - subSystem = "snapshot_controller" - metricName = "operation_total_seconds" - metricHelpMsg = "Total number of seconds spent by the controller on an operation from end to end" + labelDriverName = "driver_name" + labelOperationName = "operation_name" + labelOperationStatus = "operation_status" + labelSnapshotType = "snapshot_type" + subSystem = "snapshot_controller" + operationLatencyMetricName = "operation_total_seconds" + operationLatencyMetricHelpMsg = "Total number of seconds spent by the controller on an operation" + unknownDriverName = "unknown" + + // CreateSnapshotOperationName is the operation that tracks how long the controller takes to create a snapshot. + // Specifically, the operation metric is emitted based on the following timestamps: + // - Start_time: controller notices the first time that there is a new VolumeSnapshot CR to dynamically provision a snapshot + // - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil + CreateSnapshotOperationName = "CreateSnapshot" + + // CreateSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a snapshot and for it to be ready. + // Specifically, the operation metric is emitted based on the following timestamps: + // - Start_time: controller notices the first time that there is a new VolumeSnapshot CR(both dynamic and pre-provisioned cases) + // - End_time: controller notices that the CR has a status with Ready field set to be true + CreateSnapshotAndReadyOperationName = "CreateSnapshotAndReady" + + // DeleteSnapshotOperationName is the operation that tracks how long a snapshot deletion takes. + // Specifically, the operation metric is emitted based on the following timestamps: + // - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeSnapshot CR and the CR is ready to be deleted. Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time. + // - End_time: controller removed all finalizers on the VolumeSnapshot CR such that the CR is ready to be removed in the API server. + DeleteSnapshotOperationName = "DeleteSnapshot" + + // DynamicSnapshotType represents a snapshot that is being dynamically provisioned + DynamicSnapshotType = snapshotProvisionType("dynamic") + // PreProvisionedSnapshotType represents a snapshot that is pre-provisioned + PreProvisionedSnapshotType = snapshotProvisionType("pre-provisioned") + + // SnapshotStatusTypeUnknown represents that the status is unknown + SnapshotStatusTypeUnknown snapshotStatusType = "unknown" + // Success and Cancel are statuses for operation time (operation_total_seconds) as seen by snapshot controller + // SnapshotStatusTypeSuccess represents that a CreateSnapshot, CreateSnapshotAndReady, + // or DeleteSnapshot has finished successfully. + // Individual reconciliations (reconciliation_total_seconds) also use this status. + SnapshotStatusTypeSuccess snapshotStatusType = "success" + // SnapshotStatusTypeCancel represents that a CreateSnapshot, CreateSnapshotAndReady, + // or DeleteSnapshot has been deleted before finishing. + SnapshotStatusTypeCancel snapshotStatusType = "cancel" ) // OperationStatus is the interface type for representing an operation's execution @@ -57,11 +92,11 @@ type MetricsManager interface { // OperationStart takes in an operation and caches its start time. // if the operation already exists, it's an no-op. - OperationStart(op Operation) + OperationStart(key OperationKey, val OperationValue) // DropOperation removes an operation from cache. // if the operation does not exist, it's an no-op. - DropOperation(op Operation) + DropOperation(op OperationKey) // RecordMetrics records a metric point. Note that it will be an no-op if an // operation has NOT been marked "Started" previously via invoking "OperationStart". @@ -69,20 +104,49 @@ type MetricsManager interface { // op - the operation which the metric is associated with. // status - the operation status, if not specified, i.e., status == nil, an // "Unknown" status of the passed-in operation is assumed. - RecordMetrics(op Operation, status OperationStatus) + RecordMetrics(op OperationKey, status OperationStatus, driverName string) } -// Operation is a structure which holds information to identify a snapshot -// related operation -type Operation struct { - // the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot" +// OperationKey is a structure which holds information to +// uniquely identify a snapshot related operation +type OperationKey struct { + // Name is the name of the operation, for example: "CreateSnapshot", "DeleteSnapshot" Name string - // the name of the driver which executes the operation - Driver string - // the resource UID to which the operation has been executed against + // ResourceID is the resource UID to which the operation has been executed against ResourceID types.UID } +// OperationValue is a structure which holds operation metadata +type OperationValue struct { + // Driver is the driver name which executes the operation + Driver string + // SnapshotType represents the snapshot type, for example: "dynamic", "pre-provisioned" + SnapshotType string + + // startTime is the time when the operation first started + startTime time.Time +} + +// NewOperationKey initializes a new OperationKey +func NewOperationKey(name string, snapshotUID types.UID) OperationKey { + return OperationKey{ + Name: name, + ResourceID: snapshotUID, + } +} + +// NewOperationValue initializes a new OperationValue +func NewOperationValue(driver string, snapshotType snapshotProvisionType) OperationValue { + if driver == "" { + driver = unknownDriverName + } + + return OperationValue{ + Driver: driver, + SnapshotType: string(snapshotType), + } +} + type operationMetricsManager struct { // cache is a concurrent-safe map which stores start timestamps for all // ongoing operations. @@ -93,10 +157,11 @@ type operationMetricsManager struct { // registry is a wrapper around Prometheus Registry registry k8smetrics.KubeRegistry - // opLatencyMetrics is a Histogram metrics + // opLatencyMetrics is a Histogram metrics for operation time per request opLatencyMetrics *k8smetrics.HistogramVec } +// NewMetricsManager creates a new MetricsManager instance func NewMetricsManager() MetricsManager { mgr := &operationMetricsManager{ cache: sync.Map{}, @@ -105,34 +170,83 @@ func NewMetricsManager() MetricsManager { return mgr } -func (opMgr *operationMetricsManager) OperationStart(op Operation) { - opMgr.cache.LoadOrStore(op, time.Now()) +// OperationStart starts a new operation +func (opMgr *operationMetricsManager) OperationStart(key OperationKey, val OperationValue) { + val.startTime = time.Now() + opMgr.cache.LoadOrStore(key, val) } -func (opMgr *operationMetricsManager) DropOperation(op Operation) { +// OperationStart drops an operation +func (opMgr *operationMetricsManager) DropOperation(op OperationKey) { opMgr.cache.Delete(op) } -func (opMgr *operationMetricsManager) RecordMetrics(op Operation, status OperationStatus) { - obj, exists := opMgr.cache.Load(op) +// RecordMetrics emits operation metrics +func (opMgr *operationMetricsManager) RecordMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) { + obj, exists := opMgr.cache.Load(opKey) if !exists { // the operation has not been cached, return directly return } - ts, ok := obj.(time.Time) + opVal, ok := obj.(OperationValue) if !ok { - // the cached item is not a time.Time, should NEVER happen, clean and return - klog.Errorf("Invalid cache entry for key %v", op) - opMgr.cache.Delete(op) + // the cached item is not a OperationValue, should NEVER happen, clean and return + klog.Errorf("Invalid cache entry for key %v", opKey) + opMgr.cache.Delete(opKey) return } - strStatus := opStatusUnknown - if status != nil { - strStatus = status.String() + status := string(SnapshotStatusTypeUnknown) + if opStatus != nil { + status = opStatus.String() } - duration := time.Since(ts).Seconds() - opMgr.opLatencyMetrics.WithLabelValues(op.Driver, op.Name, strStatus).Observe(duration) - opMgr.cache.Delete(op) + + // if we do not know the driverName while recording metrics, + // refer to the cached version instead. + if driverName == "" || driverName == unknownDriverName { + driverName = opVal.Driver + } + + operationDuration := time.Since(opVal.startTime).Seconds() + opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.SnapshotType, status).Observe(operationDuration) + + // Report cancel metrics if we are deleting an unfinished VolumeSnapshot + if opKey.Name == DeleteSnapshotOperationName { + // check if we have a CreateSnapshot operation pending for this + createKey := NewOperationKey(CreateSnapshotOperationName, opKey.ResourceID) + obj, exists := opMgr.cache.Load(createKey) + if exists { + // record a cancel metric if found + opMgr.recordCancelMetric(obj, createKey, operationDuration) + } + + // check if we have a CreateSnapshotAndReady operation pending for this + createAndReadyKey := NewOperationKey(CreateSnapshotAndReadyOperationName, opKey.ResourceID) + obj, exists = opMgr.cache.Load(createAndReadyKey) + if exists { + // record a cancel metric if found + opMgr.recordCancelMetric(obj, createAndReadyKey, operationDuration) + } + } + + opMgr.cache.Delete(opKey) +} + +// recordCancelMetric records a metric for a create operation that hasn't finished +func (opMgr *operationMetricsManager) recordCancelMetric(obj interface{}, key OperationKey, duration float64) { + // record a cancel metric if found + val, ok := obj.(OperationValue) + if !ok { + klog.Errorf("Invalid cache entry for key %v", key) + opMgr.cache.Delete(key) + return + } + opMgr.opLatencyMetrics.WithLabelValues( + val.Driver, + key.Name, + val.SnapshotType, + string(SnapshotStatusTypeCancel), + ).Observe(duration) + opMgr.cache.Delete(key) } func (opMgr *operationMetricsManager) init() { @@ -140,11 +254,11 @@ func (opMgr *operationMetricsManager) init() { opMgr.opLatencyMetrics = k8smetrics.NewHistogramVec( &k8smetrics.HistogramOpts{ Subsystem: subSystem, - Name: metricName, - Help: metricHelpMsg, + Name: operationLatencyMetricName, + Help: operationLatencyMetricHelpMsg, Buckets: metricBuckets, }, - []string{labelDriverName, labelOperationName, labelOperationStatus}, + []string{labelDriverName, labelOperationName, labelSnapshotType, labelOperationStatus}, ) opMgr.registry.MustRegister(opMgr.opLatencyMetrics) } @@ -175,3 +289,25 @@ func (opMgr *operationMetricsManager) StartMetricsEndpoint(pattern, addr string, }() return srv, nil } + +// snapshotProvisionType represents which kind of snapshot a metric is +type snapshotProvisionType string + +// snapshotStatusType represents the type of snapshot status to report +type snapshotStatusType string + +// SnapshotOperationStatus represents the status for a snapshot controller operation +type SnapshotOperationStatus struct { + status snapshotStatusType +} + +// NewSnapshotOperationStatus returns a new SnapshotOperationStatus +func NewSnapshotOperationStatus(status snapshotStatusType) SnapshotOperationStatus { + return SnapshotOperationStatus{ + status: status, + } +} + +func (sos SnapshotOperationStatus) String() string { + return string(sos.status) +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 0360e928c..aa766fb14 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -19,10 +19,12 @@ package metrics import ( "context" "fmt" + "io" "io/ioutil" "log" "net/http" "reflect" + "sort" "strings" "sync" "testing" @@ -86,9 +88,8 @@ func TestNew(t *testing.T) { func TestDropNonExistingOperation(t *testing.T) { mgr, wg, srv := initMgr() defer shutdown(srv, wg) - op := Operation{ + op := OperationKey{ Name: "drop-non-existing-operation-should-be-noop", - Driver: "driver", ResourceID: types.UID("uid"), } mgr.DropOperation(op) @@ -98,12 +99,11 @@ func TestRecordMetricsForNonExistingOperation(t *testing.T) { mgr, wg, srv := initMgr() srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) - op := Operation{ + opKey := OperationKey{ Name: "no-metrics-should-be-recorded-as-operation-did-not-start", - Driver: "driver", ResourceID: types.UID("uid"), } - mgr.RecordMetrics(op, nil) + mgr.RecordMetrics(opKey, nil, "driver") rsp, err := http.Get(srvAddr) if err != nil || rsp.StatusCode != http.StatusOK { t.Errorf("failed to get response from server %v, %v", err, rsp) @@ -112,8 +112,8 @@ func TestRecordMetricsForNonExistingOperation(t *testing.T) { if err != nil { t.Errorf("failed to read response body %v", err) } - if strings.Contains(string(r), op.Name) { - t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r)) + if strings.Contains(string(r), opKey.Name) { + t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r)) } } @@ -121,13 +121,13 @@ func TestDropOperation(t *testing.T) { mgr, wg, srv := initMgr() srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) - op := Operation{ + opKey := OperationKey{ Name: "should-have-been-dropped", - Driver: "driver", ResourceID: types.UID("uid"), } - mgr.OperationStart(op) - mgr.DropOperation(op) + opVal := NewOperationValue("driver", DynamicSnapshotType) + mgr.OperationStart(opKey, opVal) + mgr.DropOperation(opKey) time.Sleep(300 * time.Millisecond) rsp, err := http.Get(srvAddr) if err != nil || rsp.StatusCode != http.StatusOK { @@ -137,36 +137,36 @@ func TestDropOperation(t *testing.T) { if err != nil { t.Errorf("failed to read response body %v", err) } - if strings.Contains(string(r), op.Name) { - t.Errorf("found metric should have been dropped for operation [%s] [%s]", op.Name, string(r)) + if strings.Contains(string(r), opKey.Name) { + t.Errorf("found metric should have been dropped for operation [%s] [%s]", opKey.Name, string(r)) } // re-add with a different name - op.Name = "should-have-been-added" - mgr.OperationStart(op) + opKey.Name = "should-have-been-added" + mgr.OperationStart(opKey, opVal) time.Sleep(300 * time.Millisecond) opStatus := &fakeOpStatus{ statusCode: 0, } - mgr.RecordMetrics(op, opStatus) + mgr.RecordMetrics(opKey, opStatus, "driver") expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.25"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 0.3 -snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.25"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 0.3 +snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="should-have-been-added",operation_status="Success",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { @@ -178,34 +178,33 @@ func TestUnknownStatus(t *testing.T) { mgr, wg, srv := initMgr() srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) - op := Operation{ + opKey := OperationKey{ Name: "unknown-status-operation", - Driver: "driver", ResourceID: types.UID("uid"), } - mgr.OperationStart(op) + mgr.OperationStart(opKey, NewOperationValue("driver", DynamicSnapshotType)) // should create a Unknown data point with latency ~300ms time.Sleep(300 * time.Millisecond) - mgr.RecordMetrics(op, nil) + mgr.RecordMetrics(opKey, nil, "driver") expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.25"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 0.3 -snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.25"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 0.3 +snapshot_controller_operation_total_seconds_count{driver_name="driver",operation_name="unknown-status-operation",operation_status="Unknown",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { @@ -218,66 +217,65 @@ func TestRecordMetrics(t *testing.T) { srvAddr := "http://" + srv.Addr + httpPattern defer shutdown(srv, wg) // add an operation - op := Operation{ + opKey := OperationKey{ Name: "op1", - Driver: "driver1", ResourceID: types.UID("uid1"), } - mgr.OperationStart(op) + opVal := NewOperationValue("driver", DynamicSnapshotType) + mgr.OperationStart(opKey, opVal) // should create a Success data point with latency ~ 1100ms time.Sleep(1100 * time.Millisecond) success := &fakeOpStatus{ statusCode: 0, } - mgr.RecordMetrics(op, success) + mgr.RecordMetrics(opKey, success, "driver") // add another operation metric - op.Name = "op2" - op.Driver = "driver2" - op.ResourceID = types.UID("uid2") - mgr.OperationStart(op) + opKey.Name = "op2" + opKey.ResourceID = types.UID("uid2") + mgr.OperationStart(opKey, opVal) // should create a Failure data point with latency ~ 100ms time.Sleep(100 * time.Millisecond) failure := &fakeOpStatus{ statusCode: 1, } - mgr.RecordMetrics(op, failure) + mgr.RecordMetrics(opKey, failure, "driver") expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.25"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="0.5"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success"} 1.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.25"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="0.5"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="op1",operation_status="Success",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="op2",operation_status="Failure",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { t.Errorf("failed testing [%v]", err) @@ -295,15 +293,14 @@ func TestConcurrency(t *testing.T) { statusCode: 1, } ops := []struct { - op Operation + op OperationKey desiredLatencyMs time.Duration status OperationStatus drop bool }{ { - Operation{ + OperationKey{ Name: "success1", - Driver: "driver1", ResourceID: types.UID("uid1"), }, 100, @@ -311,9 +308,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "success2", - Driver: "driver2", ResourceID: types.UID("uid2"), }, 100, @@ -321,9 +317,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "failure1", - Driver: "driver3", ResourceID: types.UID("uid3"), }, 100, @@ -331,9 +326,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "failure2", - Driver: "driver4", ResourceID: types.UID("uid4"), }, 100, @@ -341,9 +335,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "unknown", - Driver: "driver5", ResourceID: types.UID("uid5"), }, 100, @@ -351,9 +344,8 @@ func TestConcurrency(t *testing.T) { false, }, { - Operation{ + OperationKey{ Name: "drop-from-cache", - Driver: "driver6", ResourceID: types.UID("uid6"), }, 100, @@ -363,15 +355,20 @@ func TestConcurrency(t *testing.T) { } for i := range ops { - mgr.OperationStart(ops[i].op) + mgr.OperationStart(ops[i].op, OperationValue{ + Driver: fmt.Sprintf("driver%v", i), + SnapshotType: string(DynamicSnapshotType), + }) } // add an extra operation which should remain in the cache - remaining := Operation{ + remaining := OperationKey{ Name: "remaining-in-cache", - Driver: "driver7", ResourceID: types.UID("uid7"), } - mgr.OperationStart(remaining) + mgr.OperationStart(remaining, OperationValue{ + Driver: "driver7", + SnapshotType: string(DynamicSnapshotType), + }) var wgMetrics sync.WaitGroup @@ -385,7 +382,7 @@ func TestConcurrency(t *testing.T) { if ops[i].drop { mgr.DropOperation(ops[i].op) } else { - mgr.RecordMetrics(ops[i].op, ops[i].status) + mgr.RecordMetrics(ops[i].op, ops[i].status, fmt.Sprintf("driver%v", i)) } }(i) } @@ -395,86 +392,86 @@ func TestConcurrency(t *testing.T) { expected := `# HELP snapshot_controller_operation_total_seconds [ALPHA] Total number of seconds spent by the controller on an operation from end to end # TYPE snapshot_controller_operation_total_seconds histogram -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.1"} 0 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.25"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="0.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="1"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="2.5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="5"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="10"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="15"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="30"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="60"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="120"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="300"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="600"} 1 -snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",le="+Inf"} 1 -snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 0.1 -snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver1",operation_name="success1",operation_status="Success",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver2",operation_name="success2",operation_status="Success",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver3",operation_name="failure1",operation_status="Failure",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver4",operation_name="failure2",operation_status="Failure",snapshot_type=""} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.1"} 0 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.25"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="0.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="1"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="2.5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="5"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="10"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="15"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="30"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="60"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="120"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="300"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="600"} 1 +snapshot_controller_operation_total_seconds_bucket{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type="",le="+Inf"} 1 +snapshot_controller_operation_total_seconds_sum{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 0.1 +snapshot_controller_operation_total_seconds_count{driver_name="driver5",operation_name="unknown",operation_status="Unknown",snapshot_type=""} 1 ` if err := verifyMetric(expected, srvAddr); err != nil { t.Errorf("failed testing [%v]", err) @@ -494,63 +491,141 @@ func verifyMetric(expected, srvAddr string) error { return err } format := expfmt.ResponseFormat(rsp.Header) - reader := strings.NewReader(string(r)) - decoder := expfmt.NewDecoder(reader, format) - mf := &cmg.MetricFamily{} - if err := decoder.Decode(mf); err != nil { - return err - } - reader = strings.NewReader(expected) - decoder = expfmt.NewDecoder(reader, format) - expectedMf := &cmg.MetricFamily{} - if err := decoder.Decode(expectedMf); err != nil { - return err + gotReader := strings.NewReader(string(r)) + gotDecoder := expfmt.NewDecoder(gotReader, format) + expectedReader := strings.NewReader(expected) + expectedDecoder := expfmt.NewDecoder(expectedReader, format) + + gotMfs := []*cmg.MetricFamily{} + expectedMfs := []*cmg.MetricFamily{} + for { + gotMf := &cmg.MetricFamily{} + gotErr := gotDecoder.Decode(gotMf) + expectedMf := &cmg.MetricFamily{} + if expectedErr := expectedDecoder.Decode(expectedMf); expectedErr != nil { + // return correctly if both are EOF + if expectedErr == io.EOF && gotErr == io.EOF { + break + } else { + return err + } + } + gotMfs = append(gotMfs, gotMf) + expectedMfs = append(expectedMfs, expectedMf) } - if !containsMetrics(expectedMf, mf) { + + if !containsMetrics(expectedMfs, gotMfs) { return fmt.Errorf("failed testing, expected\n%s\n, got\n%s\n", expected, string(r)) } + return nil } -func containsMetrics(expected, got *cmg.MetricFamily) bool { - if (got.Name == nil || *(got.Name) != *(expected.Name)) || - (got.Type == nil || *(got.Type) != *(expected.Type)) || - (got.Help == nil || *(got.Help) != *(expected.Help)) { - return false - } +// sortMfs, sorts metric families in alphabetical order by type. +// currently only supports counter and histogram +func sortMfs(mfs []*cmg.MetricFamily) []*cmg.MetricFamily { + var sortedMfs []*cmg.MetricFamily + + // Sort first by type + sort.Slice(mfs, func(i, j int) bool { + return *mfs[i].Type < *mfs[j].Type + }) - numRecords := len(expected.Metric) - if len(got.Metric) < numRecords { + // Next, sort by length of name + sort.Slice(mfs, func(i, j int) bool { + return len(*mfs[i].Name) < len(*mfs[j].Name) + }) + + return sortedMfs +} + +func containsMetrics(expectedMfs, gotMfs []*cmg.MetricFamily) bool { + if len(gotMfs) != len(expectedMfs) { + fmt.Printf("Not same length: expected and got metrics families: %v vs. %v\n", len(expectedMfs), len(gotMfs)) return false } - matchCount := 0 - for i := 0; i < len(got.Metric); i++ { - for j := 0; j < numRecords; j++ { - // labels should be the same - if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) { - continue - } - gh := got.Metric[i].Histogram - eh := expected.Metric[j].Histogram - if gh == nil { - continue - } - if !reflect.DeepEqual(gh.Bucket, eh.Bucket) { - continue - } - // this is a sum record, expecting a latency which is more than the - // expected one. If the sum is smaller than expected, it will be considered - // as NOT a match - if gh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) { - continue - } - if gh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) { - continue + + // sort metric families for deterministic comparison. + sortedExpectedMfs := sortMfs(expectedMfs) + sortedGotMfs := sortMfs(gotMfs) + + // compare expected vs. sorted actual metrics + for k, got := range sortedGotMfs { + matchCount := 0 + expected := sortedExpectedMfs[k] + + if (got.Name == nil || *(got.Name) != *(expected.Name)) || + (got.Type == nil || *(got.Type) != *(expected.Type)) || + (got.Help == nil || *(got.Help) != *(expected.Help)) { + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Name, *expected.Name) + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Type, *expected.Type) + fmt.Printf("invalid header info: got: %v, expected: %v\n", *got.Help, *expected.Help) + return false + } + + numRecords := len(expected.Metric) + if len(got.Metric) < numRecords { + fmt.Printf("Not the same number of records: got.Metric: %v, numRecords: %v\n", len(got.Metric), numRecords) + return false + } + for i := 0; i < len(got.Metric); i++ { + for j := 0; j < numRecords; j++ { + if got.Metric[i].Histogram == nil && expected.Metric[j].Histogram != nil || + got.Metric[i].Histogram != nil && expected.Metric[j].Histogram == nil { + fmt.Printf("got metric and expected metric histogram type mismatch") + return false + } + + // labels should be the same + if !reflect.DeepEqual(got.Metric[i].Label, expected.Metric[j].Label) { + continue + } + + // metric type specific checks + switch { + case got.Metric[i].Histogram != nil && expected.Metric[j].Histogram != nil: + gh := got.Metric[i].Histogram + eh := expected.Metric[j].Histogram + if gh == nil || eh == nil { + continue + } + if !reflect.DeepEqual(gh.Bucket, eh.Bucket) { + fmt.Println("got and expected histogram bucket not equal") + continue + } + + // this is a sum record, expecting a latency which is more than the + // expected one. If the sum is smaller than expected, it will be considered + // as NOT a match + if gh.SampleSum == nil || eh.SampleSum == nil || *(gh.SampleSum) < *(eh.SampleSum) { + fmt.Println("difference in sample sum") + continue + } + if gh.SampleCount == nil || eh.SampleCount == nil || *(gh.SampleCount) != *(eh.SampleCount) { + fmt.Println("difference in sample count") + continue + } + + case got.Metric[i].Counter != nil && expected.Metric[j].Counter != nil: + gc := got.Metric[i].Counter + ec := expected.Metric[j].Counter + if gc.Value == nil || *(gc.Value) != *(ec.Value) { + fmt.Println("difference in counter values") + continue + } + } + + // this is a match + matchCount = matchCount + 1 + break } - // this is a match - matchCount = matchCount + 1 - break + } + + if matchCount != numRecords { + fmt.Printf("matchCount %v, numRecords %v\n", matchCount, numRecords) + return false } } - return matchCount == numRecords + + return true } diff --git a/pkg/utils/util.go b/pkg/utils/util.go index c7c52086a..51111591d 100644 --- a/pkg/utils/util.go +++ b/pkg/utils/util.go @@ -513,3 +513,8 @@ func IsSnapshotReady(snapshot *crdv1.VolumeSnapshot) bool { } return true } + +// IsSnapshotCreated indicates that the snapshot has been cut on a storage system +func IsSnapshotCreated(snapshot *crdv1.VolumeSnapshot) bool { + return snapshot.Status != nil && snapshot.Status.CreationTime != nil +} diff --git a/pkg/validation-webhook/certwatcher.go b/pkg/validation-webhook/certwatcher.go index d79908df7..6ea928317 100644 --- a/pkg/validation-webhook/certwatcher.go +++ b/pkg/validation-webhook/certwatcher.go @@ -22,7 +22,7 @@ import ( "sync" "github.com/fsnotify/fsnotify" - "k8s.io/klog" + "k8s.io/klog/v2" ) // This file originated from github.com/kubernetes-sigs/controller-runtime/pkg/webhook/internal/certwatcher. diff --git a/vendor/github.com/kubernetes-csi/csi-lib-utils/connection/connection.go b/vendor/github.com/kubernetes-csi/csi-lib-utils/connection/connection.go index c78f12bfc..ac07d59be 100644 --- a/vendor/github.com/kubernetes-csi/csi-lib-utils/connection/connection.go +++ b/vendor/github.com/kubernetes-csi/csi-lib-utils/connection/connection.go @@ -27,7 +27,7 @@ import ( "github.com/kubernetes-csi/csi-lib-utils/metrics" "github.com/kubernetes-csi/csi-lib-utils/protosanitizer" "google.golang.org/grpc" - "k8s.io/klog" + "k8s.io/klog/v2" ) const ( diff --git a/vendor/github.com/kubernetes-csi/csi-lib-utils/leaderelection/leader_election.go b/vendor/github.com/kubernetes-csi/csi-lib-utils/leaderelection/leader_election.go index 25fdf70a9..5db61ca18 100644 --- a/vendor/github.com/kubernetes-csi/csi-lib-utils/leaderelection/leader_election.go +++ b/vendor/github.com/kubernetes-csi/csi-lib-utils/leaderelection/leader_election.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "io/ioutil" + "net/http" "os" "regexp" "strings" @@ -32,13 +33,21 @@ import ( "k8s.io/client-go/tools/leaderelection" "k8s.io/client-go/tools/leaderelection/resourcelock" "k8s.io/client-go/tools/record" - "k8s.io/klog" + "k8s.io/klog/v2" ) const ( defaultLeaseDuration = 15 * time.Second defaultRenewDeadline = 10 * time.Second defaultRetryPeriod = 5 * time.Second + + DefaultHealthCheckTimeout = 20 * time.Second + + // HealthCheckerAddress is the address at which the leader election health + // checker reports status. + // The caller sidecar should document this address in appropriate flag + // descriptions. + HealthCheckerAddress = "/healthz/leader-election" ) // leaderElection is a convenience wrapper around client-go's leader election library. @@ -55,6 +64,9 @@ type leaderElection struct { // valid options are resourcelock.LeasesResourceLock, resourcelock.EndpointsResourceLock, // and resourcelock.ConfigMapsResourceLock resourceLock string + // healthCheck reports unhealthy if leader election fails to renew leadership + // within a timeout period. + healthCheck *leaderelection.HealthzAdaptor leaseDuration time.Duration renewDeadline time.Duration @@ -134,6 +146,27 @@ func (l *leaderElection) WithContext(ctx context.Context) { l.ctx = ctx } +// Server represents any type that could serve HTTP requests for the leader +// election health check endpoint. +type Server interface { + Handle(pattern string, handler http.Handler) +} + +// PrepareHealthCheck creates a health check for this leader election object +// with the given healthCheckTimeout and registers its HTTP handler to the given +// server at the path specified by the constant "healthCheckerAddress". +// healthCheckTimeout determines the max duration beyond lease expiration +// allowed before reporting unhealthy. +// The caller sidecar should document the handler address in appropriate flag +// descriptions. +func (l *leaderElection) PrepareHealthCheck( + s Server, + healthCheckTimeout time.Duration) { + + l.healthCheck = leaderelection.NewLeaderHealthzAdaptor(healthCheckTimeout) + s.Handle(HealthCheckerAddress, adaptCheckToHandler(l.healthCheck.Check)) +} + func (l *leaderElection) Run() error { if l.identity == "" { id, err := defaultLeaderElectionIdentity() @@ -179,6 +212,7 @@ func (l *leaderElection) Run() error { klog.V(3).Infof("new leader detected, current leader: %s", identity) }, }, + WatchDog: l.healthCheck, } ctx := l.ctx @@ -220,3 +254,15 @@ func inClusterNamespace() string { return "default" } + +// adaptCheckToHandler returns an http.HandlerFunc that serves the provided checks. +func adaptCheckToHandler(c func(r *http.Request) error) http.HandlerFunc { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + err := c(r) + if err != nil { + http.Error(w, fmt.Sprintf("internal server error: %v", err), http.StatusInternalServerError) + } else { + fmt.Fprint(w, "ok") + } + }) +} diff --git a/vendor/github.com/kubernetes-csi/csi-lib-utils/metrics/metrics.go b/vendor/github.com/kubernetes-csi/csi-lib-utils/metrics/metrics.go index c18ab160a..495e1ff85 100644 --- a/vendor/github.com/kubernetes-csi/csi-lib-utils/metrics/metrics.go +++ b/vendor/github.com/kubernetes-csi/csi-lib-utils/metrics/metrics.go @@ -27,7 +27,6 @@ import ( "google.golang.org/grpc/codes" "google.golang.org/grpc/status" "k8s.io/component-base/metrics" - "k8s.io/klog" ) const ( @@ -90,10 +89,15 @@ type CSIMetricsManager interface { // driverName - Name of the CSI driver against which this operation was executed. SetDriverName(driverName string) - // StartMetricsEndpoint starts the metrics endpoint at the specified address/path - // for this metrics manager. - // If the metricsAddress is an empty string, this will be a no op. - StartMetricsEndpoint(metricsAddress, metricsPath string) + // RegisterToServer registers an HTTP handler for this metrics manager to the + // given server at the specified address/path. + RegisterToServer(s Server, metricsPath string) +} + +// Server represents any type that could serve HTTP requests for the metrics +// endpoint. +type Server interface { + Handle(pattern string, handler http.Handler) } // MetricsManagerOption is used to pass optional configuration to a @@ -145,6 +149,22 @@ func WithLabels(labels map[string]string) MetricsManagerOption { } } +// WithProcessStartTime controlls whether process_start_time_seconds is registered +// in the registry of the metrics manager. It's enabled by default out of convenience +// (no need to do anything special in most sidecars) but should be disabled in more +// complex scenarios (more than one metrics manager per process, metric already +// provided elsewhere like via the Prometheus Golang collector). +// +// In particular, registering this metric via metric manager and thus the Kubernetes +// component base conflicts with the Prometheus Golang collector (gathered metric family +// process_start_time_seconds has help "[ALPHA] Start time of the process since unix epoch in seconds." +// but should have "Start time of the process since unix epoch in seconds." +func WithProcessStartTime(registerProcessStartTime bool) MetricsManagerOption { + return func(cmm *csiMetricsManager) { + cmm.registerProcessStartTime = registerProcessStartTime + } +} + // NewCSIMetricsManagerForSidecar creates and registers metrics for CSI Sidecars and // returns an object that can be used to trigger the metrics. It uses "csi_sidecar" // as subsystem. @@ -177,18 +197,27 @@ func NewCSIMetricsManagerForPlugin(driverName string) CSIMetricsManager { // If unknown, leave empty, and use SetDriverName method to update later. func NewCSIMetricsManagerWithOptions(driverName string, options ...MetricsManagerOption) CSIMetricsManager { cmm := csiMetricsManager{ - registry: metrics.NewKubeRegistry(), - subsystem: SubsystemSidecar, - stabilityLevel: metrics.ALPHA, + registry: metrics.NewKubeRegistry(), + subsystem: SubsystemSidecar, + stabilityLevel: metrics.ALPHA, + registerProcessStartTime: true, } - // https://github.com/open-telemetry/opentelemetry-collector/issues/969 - // Add process_start_time_seconds into the metric to let the start time be parsed correctly - metrics.RegisterProcessStartTime(cmm.registry.Register) - for _, option := range options { option(&cmm) } + + if cmm.registerProcessStartTime { + // https://github.com/open-telemetry/opentelemetry-collector/issues/969 + // Add process_start_time_seconds into the metric to let the start time be parsed correctly + metrics.RegisterProcessStartTime(cmm.registry.Register) + // TODO: This is a bug in component-base library. We need to remove this after upgrade component-base dependency + // BugFix: https://github.com/kubernetes/kubernetes/pull/96435 + // The first call to RegisterProcessStartTime can only create the metric, so we need a second call to actually + // register the metric. + metrics.RegisterProcessStartTime(cmm.registry.Register) + } + labels := []string{labelCSIDriverName, labelCSIOperationName, labelGrpcStatusCode} labels = append(labels, cmm.additionalLabelNames...) for _, label := range cmm.additionalLabels { @@ -219,6 +248,7 @@ type csiMetricsManager struct { additionalLabelNames []string additionalLabels []label csiOperationsLatencyMetric *metrics.HistogramVec + registerProcessStartTime bool } type label struct { @@ -325,27 +355,13 @@ func (cmm *csiMetricsManager) SetDriverName(driverName string) { } } -// StartMetricsEndpoint starts the metrics endpoint at the specified address/path -// for this metrics manager on a new go routine. -// If the metricsAddress is an empty string, this will be a no op. -func (cmm *csiMetricsManager) StartMetricsEndpoint(metricsAddress, metricsPath string) { - if metricsAddress == "" { - klog.Warningf("metrics endpoint will not be started because `metrics-address` was not specified.") - return - } - - http.Handle(metricsPath, metrics.HandlerFor( +// RegisterToServer registers an HTTP handler for this metrics manager to the +// given server at the specified address/path. +func (cmm *csiMetricsManager) RegisterToServer(s Server, metricsPath string) { + s.Handle(metricsPath, metrics.HandlerFor( cmm.GetRegistry(), metrics.HandlerOpts{ ErrorHandling: metrics.ContinueOnError})) - - // Spawn a new go routine to listen on specified endpoint - go func() { - err := http.ListenAndServe(metricsAddress, nil) - if err != nil { - klog.Fatalf("Failed to start prometheus metrics endpoint on specified address (%q) and path (%q): %s", metricsAddress, metricsPath, err) - } - }() } // VerifyMetricsMatch is a helper function that verifies that the expected and diff --git a/vendor/github.com/kubernetes-csi/csi-lib-utils/rpc/common.go b/vendor/github.com/kubernetes-csi/csi-lib-utils/rpc/common.go index bb4a5c448..9dcb3534b 100644 --- a/vendor/github.com/kubernetes-csi/csi-lib-utils/rpc/common.go +++ b/vendor/github.com/kubernetes-csi/csi-lib-utils/rpc/common.go @@ -27,7 +27,7 @@ import ( "github.com/container-storage-interface/spec/lib/go/csi" - "k8s.io/klog" + "k8s.io/klog/v2" ) const ( diff --git a/vendor/modules.txt b/vendor/modules.txt index 22d6723be..4dcedd29b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -57,7 +57,7 @@ github.com/imdario/mergo github.com/inconshreveable/mousetrap # github.com/json-iterator/go v1.1.10 github.com/json-iterator/go -# github.com/kubernetes-csi/csi-lib-utils v0.8.1 +# github.com/kubernetes-csi/csi-lib-utils v0.9.0 ## explicit github.com/kubernetes-csi/csi-lib-utils/connection github.com/kubernetes-csi/csi-lib-utils/leaderelection