From 76d8e7c11b65db4c77c9487a8db223fee8dda414 Mon Sep 17 00:00:00 2001 From: Hanchi Zhang Date: Fri, 10 Oct 2025 03:36:59 +0000 Subject: [PATCH] feat: Instrument unexpected device path changes Adds a new metric, `unexpected_device_path_changes`, to track unexpected device path changes. This metric is incremented whenever the device path for a volume changes, as detected by the linkcache's periodic check. This helps in monitoring and alerting on potentially unstable device paths, which can be an indicator of underlying issues. --- cmd/gce-pd-csi-driver/main.go | 3 ++- pkg/linkcache/devices_linux.go | 21 +++++++++++++-------- pkg/linkcache/devices_windows.go | 3 ++- pkg/linkcache/types.go | 6 ++++-- pkg/metrics/metrics.go | 18 ++++++++++++++++++ 5 files changed, 39 insertions(+), 12 deletions(-) diff --git a/cmd/gce-pd-csi-driver/main.go b/cmd/gce-pd-csi-driver/main.go index 911b6df5a..f2b2e884e 100644 --- a/cmd/gce-pd-csi-driver/main.go +++ b/cmd/gce-pd-csi-driver/main.go @@ -174,6 +174,7 @@ func handle() { klog.Errorf("Failed to emit process start time: %v", err.Error()) } mm.RegisterMountMetric() + mm.RegisterUnexpectedDevicePathChangesMetric() } metricsManager = &mm } @@ -282,7 +283,7 @@ func handle() { klog.Fatalf("Failed to get node info from API server: %v", err.Error()) } - deviceCache, err := linkcache.NewDeviceCacheForNode(ctx, *diskCacheSyncPeriod, *nodeName, driverName, deviceUtils) + deviceCache, err := linkcache.NewDeviceCacheForNode(ctx, *diskCacheSyncPeriod, *nodeName, driverName, deviceUtils, metricsManager) if err != nil { klog.Warningf("Failed to create device cache: %v", err.Error()) } else { diff --git a/pkg/linkcache/devices_linux.go b/pkg/linkcache/devices_linux.go index b6e367d1e..35b97f7e6 100644 --- a/pkg/linkcache/devices_linux.go +++ b/pkg/linkcache/devices_linux.go @@ -12,21 +12,22 @@ import ( "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/common" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/k8sclient" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" ) const byIdDir = "/dev/disk/by-id" -func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils) (*DeviceCache, error) { +func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils, metricsManager *metrics.MetricsManager) (*DeviceCache, error) { node, err := k8sclient.GetNodeWithRetry(ctx, nodeName) if err != nil { return nil, fmt.Errorf("failed to get node %s: %w", nodeName, err) } - return newDeviceCacheForNode(period, node, driverName, deviceUtils), nil + return newDeviceCacheForNode(period, node, driverName, deviceUtils, metricsManager), nil } func NewTestDeviceCache(period time.Duration, node *v1.Node) *DeviceCache { - return newDeviceCacheForNode(period, node, "pd.csi.storage.gke.io", deviceutils.NewDeviceUtils()) + return newDeviceCacheForNode(period, node, "pd.csi.storage.gke.io", deviceutils.NewDeviceUtils(), nil) } func NewTestNodeWithVolumes(volumes []string) *v1.Node { @@ -42,12 +43,13 @@ func NewTestNodeWithVolumes(volumes []string) *v1.Node { } } -func newDeviceCacheForNode(period time.Duration, node *v1.Node, driverName string, deviceUtils deviceutils.DeviceUtils) *DeviceCache { +func newDeviceCacheForNode(period time.Duration, node *v1.Node, driverName string, deviceUtils deviceutils.DeviceUtils, metricsManager *metrics.MetricsManager) *DeviceCache { deviceCache := &DeviceCache{ - symlinks: make(map[string]deviceMapping), - period: period, - deviceUtils: deviceUtils, - dir: byIdDir, + symlinks: make(map[string]deviceMapping), + period: period, + deviceUtils: deviceUtils, + dir: byIdDir, + metricsManager: metricsManager, } // Look at the status.volumesInUse field. For each, take the last section @@ -163,6 +165,9 @@ func (d *DeviceCache) listAndUpdate() { // Check if the realPath has changed if realPath != device.realPath { klog.Warningf("Change in device path for volume %s (symlink: %s), previous path: %s, new path: %s", device.volumeID, symlink, device.realPath, realPath) + if d.metricsManager != nil { + d.metricsManager.RecordUnexpectedDevicePathChangesMetric() + } // Update the cache with the new realPath device.realPath = realPath diff --git a/pkg/linkcache/devices_windows.go b/pkg/linkcache/devices_windows.go index d7937bc0b..287ebd835 100644 --- a/pkg/linkcache/devices_windows.go +++ b/pkg/linkcache/devices_windows.go @@ -8,9 +8,10 @@ import ( "k8s.io/klog/v2" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" ) -func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils) (*DeviceCache, error) { +func NewDeviceCacheForNode(ctx context.Context, period time.Duration, nodeName string, driverName string, deviceUtils deviceutils.DeviceUtils, metricsManager *metrics.MetricsManager) (*DeviceCache, error) { klog.Infof("NewDeviceCacheForNode is not implemented for Windows") return nil, nil } diff --git a/pkg/linkcache/types.go b/pkg/linkcache/types.go index 04d4688eb..62e98ad20 100644 --- a/pkg/linkcache/types.go +++ b/pkg/linkcache/types.go @@ -5,6 +5,7 @@ import ( "time" "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/deviceutils" + "sigs.k8s.io/gcp-compute-persistent-disk-csi-driver/pkg/metrics" ) type deviceMapping struct { @@ -17,6 +18,7 @@ type DeviceCache struct { symlinks map[string]deviceMapping period time.Duration // dir is the directory to look for device symlinks - dir string - deviceUtils deviceutils.DeviceUtils + dir string + deviceUtils deviceutils.DeviceUtils + metricsManager *metrics.MetricsManager } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 18e7d6f90..ae007a30c 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -63,6 +63,15 @@ var ( }, []string{"driver_name", "file_system_format", "error_type"}, ) + + unexpectedDevicePathChangesMetric = metrics.NewCounterVec(&metrics.CounterOpts{ + Subsystem: "node", + Name: "unexpected_device_path_changes", + Help: "Unexpected device path changes", + StabilityLevel: metrics.ALPHA, + }, + []string{"driver_name"}, + ) ) type MetricsManager struct { @@ -92,6 +101,10 @@ func (mm *MetricsManager) RegisterMountMetric() { mm.registry.MustRegister(mountErrorMetric) } +func (mm *MetricsManager) RegisterUnexpectedDevicePathChangesMetric() { + mm.registry.MustRegister(unexpectedDevicePathChangesMetric) +} + func (mm *MetricsManager) recordComponentVersionMetric() error { v := getEnvVar(envGKEPDCSIVersion) if v == "" { @@ -121,6 +134,11 @@ func (mm *MetricsManager) RecordMountErrorMetric(fs_format string, err error) { klog.Infof("Recorded mount error type: %q", errType) } +func (mm *MetricsManager) RecordUnexpectedDevicePathChangesMetric() { + unexpectedDevicePathChangesMetric.WithLabelValues(pdcsiDriverName).Inc() + klog.Infof("Recorded unexpected device path change") +} + func (mm *MetricsManager) EmmitProcessStartTime() error { return metrics.RegisterProcessStartTime(mm.registry.Register) }