From 3b3a3768266edf57ada517782072be4568d0474d Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 23 Oct 2025 20:26:24 +0000 Subject: [PATCH 1/7] Improve cgroup metric management 1. Only support cgroup v2 as v1 is essentially dead in 2025+ 2. Handle initialization only when the cgroup metrics are used a. Today that's only when someone uses --grpc-enable-orca-metrics 3. Properly handle cases where we failed to initialize properly Signed-off-by: Matt Lord --- go/vt/servenv/metrics.go | 12 +-- go/vt/servenv/metrics_cgroup.go | 122 ++++++++++-------------------- go/vt/servenv/metrics_nonlinux.go | 4 +- 3 files changed, 46 insertions(+), 92 deletions(-) diff --git a/go/vt/servenv/metrics.go b/go/vt/servenv/metrics.go index a2b25dc8fc5..52c707f89b3 100644 --- a/go/vt/servenv/metrics.go +++ b/go/vt/servenv/metrics.go @@ -17,24 +17,20 @@ limitations under the License. package servenv func getCpuUsage() float64 { - value, err := getCgroupCpu() - if err == nil { + if value, err := getCgroupCpu(); err == nil { return value } - value, err = getHostCpuUsage() - if err == nil { + if value, err := getHostCpuUsage(); err == nil { return value } return -1 } func getMemoryUsage() float64 { - value, err := getCgroupMemory() - if err == nil { + if value, err := getCgroupMemory(); err == nil { return value } - value, err = getHostMemoryUsage() - if err == nil { + if value, err := getHostMemoryUsage(); err == nil { return value } return -1 diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index ae9e8bfd07d..7114b553bc0 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -23,10 +23,10 @@ import ( "fmt" "math" "runtime" + "sync" "time" "github.com/containerd/cgroups" - "github.com/containerd/cgroups/v3/cgroup1" "github.com/containerd/cgroups/v3/cgroup2" "github.com/shirou/gopsutil/v4/mem" @@ -34,75 +34,52 @@ import ( ) var ( - cgroup2Manager *cgroup2.Manager - cgroup1Manager cgroup1.Cgroup - lastCpu uint64 - lastTime time.Time + cgroupManager *cgroup2.Manager + lastCpu uint64 + lastTime time.Time + nilCgroupManagerErr = fmt.Errorf("cgroup metrics are not properly initialized") + once sync.Once ) -func init() { - if cgroups.Mode() == cgroups.Unified { - manager, err := getCgroup2() - if err != nil { - log.Errorf("Failed to init cgroup2 manager: %v", err) - } - cgroup2Manager = manager - lastCpu, err = getCgroup2CpuUsage() - if err != nil { - log.Errorf("Failed to init cgroup2 cpu %v", err) - } - } else { - cgroup, err := getCgroup1() - if err != nil { - log.Errorf("Failed to init cgroup1 manager: %v", err) - } - cgroup1Manager = cgroup - lastCpu, err = getCgroup1CpuUsage() - if err != nil { - log.Errorf("Failed to init cgroup1 cpu %v", err) - } +func setup() { + if cgroups.Mode() != cgroups.Unified { + log.Warning("cgroup metrics are only supported with cgroup v2") + return } - lastTime = time.Now() -} - -func isCgroupV2() bool { - return cgroups.Mode() == cgroups.Unified -} - -func getCgroup1() (cgroup1.Cgroup, error) { - path := cgroup1.NestedPath("") - cgroup, err := cgroup1.Load(path) + manager, err := getCgroupManager() + if err != nil { + log.Warningf("Failed to init cgroup manager for metrics, will use host metrics: %v", err) + } + cgroupManager = manager + lastCpu, err = getCurrentCgroupCpuUsage() if err != nil { - return nil, fmt.Errorf("cgroup1 manager is nil") + log.Warningf("Failed to get initial cgroup CPU usage: %v", err) } - return cgroup, nil + lastTime = time.Now() } -func getCgroup2() (*cgroup2.Manager, error) { +func getCgroupManager() (*cgroup2.Manager, error) { path, err := cgroup2.NestedGroupPath("") if err != nil { - return nil, fmt.Errorf("failed to load cgroup2 manager: %w", err) + return nil, fmt.Errorf("failed to build nested cgroup paths: %v", err) } cgroupManager, err := cgroup2.Load(path) if err != nil { - return nil, fmt.Errorf("cgroup2 manager is nil") + return nil, fmt.Errorf("failed to load cgroup manager: %v", err) } return cgroupManager, nil } func getCgroupCpuUsage() (float64, error) { + once.Do(setup) var ( currentUsage uint64 err error ) currentTime := time.Now() - if isCgroupV2() { - currentUsage, err = getCgroup2CpuUsage() - } else { - currentUsage, err = getCgroup1CpuUsage() - } + currentUsage, err = getCurrentCgroupCpuUsage() if err != nil { - return -1, fmt.Errorf("Could not read cpu usage") + return -1, fmt.Errorf("failed to read current cgroup CPU usage: %v", err) } duration := currentTime.Sub(lastTime) usage, err := getCpuUsageFromSamples(lastCpu, currentUsage, duration) @@ -114,27 +91,14 @@ func getCgroupCpuUsage() (float64, error) { return usage, nil } -func getCgroupMemoryUsage() (float64, error) { - if isCgroupV2() { - return getCgroup2MemoryUsage() - } else { - return getCgroup1MemoryUsage() +func getCurrentCgroupCpuUsage() (uint64, error) { + once.Do(setup) + if cgroupManager == nil { + return 0, nilCgroupManagerErr } -} - -func getCgroup1CpuUsage() (uint64, error) { - stat1, err := cgroup1Manager.Stat() + stat1, err := cgroupManager.Stat() if err != nil { - return 0, fmt.Errorf("failed to get initial CPU stat: %w", err) - } - currentUsage := stat1.CPU.Usage.Total - return currentUsage, nil -} - -func getCgroup2CpuUsage() (uint64, error) { - stat1, err := cgroup2Manager.Stat() - if err != nil { - return 0, fmt.Errorf("failed to get initial CPU stat: %w", err) + return 0, fmt.Errorf("failed to get initial cgroup CPU stats: %v", err) } currentUsage := stat1.CPU.UsageUsec return currentUsage, nil @@ -154,20 +118,14 @@ func getCpuUsageFromSamples(usage1 uint64, usage2 uint64, interval time.Duration return cpuUsage, nil } -func getCgroup1MemoryUsage() (float64, error) { - stats, err := cgroup1Manager.Stat() - if err != nil { - return -1, fmt.Errorf("failed to get cgroup2 stats: %w", err) +func getCgroupMemoryUsage() (float64, error) { + once.Do(setup) + if cgroupManager == nil { + return -1, nilCgroupManagerErr } - usage := stats.Memory.Usage.Usage - limit := stats.Memory.Usage.Limit - return computeMemoryUsage(usage, limit) -} - -func getCgroup2MemoryUsage() (float64, error) { - stats, err := cgroup2Manager.Stat() + stats, err := cgroupManager.Stat() if err != nil { - return -1, fmt.Errorf("failed to get cgroup2 stats: %w", err) + return -1, fmt.Errorf("failed to get cgroup stats: %w", err) } usage := stats.Memory.Usage limit := stats.Memory.UsageLimit @@ -175,16 +133,16 @@ func getCgroup2MemoryUsage() (float64, error) { } func computeMemoryUsage(usage uint64, limit uint64) (float64, error) { - if usage == 0 || usage == math.MaxUint64 { - return -1, fmt.Errorf("Failed to find memory usage with invalid value: %d", usage) + if usage == 0 || usage == math.MaxUint64 || limit == 0 { + return -1, fmt.Errorf("invalid memory usage value: %d", usage) } if limit == 0 { - return -1, fmt.Errorf("Failed to compute memory usage with invalid limit: %d", limit) + return -1, fmt.Errorf("invalid memory limit: %d", limit) } if limit == math.MaxUint64 { vmem, err := mem.VirtualMemory() if err != nil { - return -1, fmt.Errorf("Failed to fall back to system max memory: %w", err) + return -1, fmt.Errorf("failed to get virtual memory stats: %v", err) } limit = vmem.Total } diff --git a/go/vt/servenv/metrics_nonlinux.go b/go/vt/servenv/metrics_nonlinux.go index 53d854f8c94..bcbd9f1d91f 100644 --- a/go/vt/servenv/metrics_nonlinux.go +++ b/go/vt/servenv/metrics_nonlinux.go @@ -24,9 +24,9 @@ import ( ) func getCgroupCpu() (float64, error) { - return -1, fmt.Errorf("Cgroup not supported on nonlinux platform") + return -1, fmt.Errorf("cgroups not supported on nonlinux platforms") } func getCgroupMemory() (float64, error) { - return -1, fmt.Errorf("Cgroup not supported on nonlinux platform") + return -1, fmt.Errorf("cgroups not supported on nonlinux platforms") } From ea5cbaa6258c4106407676c1e431240f413baf94 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 23 Oct 2025 20:46:48 +0000 Subject: [PATCH 2/7] Minor tweaks Signed-off-by: Matt Lord --- go/vt/servenv/metrics_cgroup.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index 7114b553bc0..ddc70d34ece 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -34,11 +34,11 @@ import ( ) var ( - cgroupManager *cgroup2.Manager - lastCpu uint64 - lastTime time.Time - nilCgroupManagerErr = fmt.Errorf("cgroup metrics are not properly initialized") - once sync.Once + once sync.Once + cgroupManager *cgroup2.Manager + lastCpu uint64 + lastTime time.Time + errCgroupMetricsNotAvailable = fmt.Errorf("cgroup metrics are not available") ) func setup() { @@ -94,7 +94,7 @@ func getCgroupCpuUsage() (float64, error) { func getCurrentCgroupCpuUsage() (uint64, error) { once.Do(setup) if cgroupManager == nil { - return 0, nilCgroupManagerErr + return 0, errCgroupMetricsNotAvailable } stat1, err := cgroupManager.Stat() if err != nil { @@ -121,7 +121,7 @@ func getCpuUsageFromSamples(usage1 uint64, usage2 uint64, interval time.Duration func getCgroupMemoryUsage() (float64, error) { once.Do(setup) if cgroupManager == nil { - return -1, nilCgroupManagerErr + return -1, errCgroupMetricsNotAvailable } stats, err := cgroupManager.Stat() if err != nil { @@ -133,7 +133,7 @@ func getCgroupMemoryUsage() (float64, error) { } func computeMemoryUsage(usage uint64, limit uint64) (float64, error) { - if usage == 0 || usage == math.MaxUint64 || limit == 0 { + if usage == 0 || usage == math.MaxUint64 { return -1, fmt.Errorf("invalid memory usage value: %d", usage) } if limit == 0 { From 68c992ec063afbdf8d8a6599df6e1edfe5602a3b Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 23 Oct 2025 21:50:57 +0000 Subject: [PATCH 3/7] Run go mod tidy Signed-off-by: Matt Lord --- go.mod | 1 - go.sum | 2 -- 2 files changed, 3 deletions(-) diff --git a/go.mod b/go.mod index 9eb06e778c1..448b6fa127f 100644 --- a/go.mod +++ b/go.mod @@ -134,7 +134,6 @@ require ( github.com/google/btree v1.1.3 // indirect github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/moby/sys/userns v0.1.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/olekukonko/cat v0.0.0-20250911104152-50322a0618f6 // indirect diff --git a/go.sum b/go.sum index ebd3bd8e61f..f2761eff8fa 100644 --- a/go.sum +++ b/go.sum @@ -428,8 +428,6 @@ github.com/minio/minio-go v0.0.0-20190131015406-c8a261de75c1 h1:jw16EimP5oAEM/2w github.com/minio/minio-go v0.0.0-20190131015406-c8a261de75c1/go.mod h1:vuvdOZLJuf5HmJAJrKV64MmozrSsk+or0PB5dzdfspg= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= -github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= -github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= From 3055d86763359081952f9d60736fdee4e48f4792 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 23 Oct 2025 21:52:23 +0000 Subject: [PATCH 4/7] Improve log message Signed-off-by: Matt Lord --- go/vt/servenv/metrics_cgroup.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index ddc70d34ece..9afb0676efa 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -43,7 +43,7 @@ var ( func setup() { if cgroups.Mode() != cgroups.Unified { - log.Warning("cgroup metrics are only supported with cgroup v2") + log.Warning("cgroup metrics are only supported with cgroup v2, will use host metrics") return } manager, err := getCgroupManager() From 7d04b3fa24e1e2288a38034008aeac94cc4325cd Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Thu, 23 Oct 2025 22:49:00 +0000 Subject: [PATCH 5/7] Remove nested once usage Signed-off-by: Matt Lord --- go/vt/servenv/metrics_cgroup.go | 1 - 1 file changed, 1 deletion(-) diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index 9afb0676efa..0cb07133630 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -92,7 +92,6 @@ func getCgroupCpuUsage() (float64, error) { } func getCurrentCgroupCpuUsage() (uint64, error) { - once.Do(setup) if cgroupManager == nil { return 0, errCgroupMetricsNotAvailable } From 9aaba25a04d05f9be825c6bec64b7d861213bfac Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 24 Oct 2025 16:33:11 +0000 Subject: [PATCH 6/7] Add unit test Signed-off-by: Matt Lord --- go/vt/servenv/metrics_cgroup_test.go | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/go/vt/servenv/metrics_cgroup_test.go b/go/vt/servenv/metrics_cgroup_test.go index 66069d1fcaf..01308ea4ec3 100644 --- a/go/vt/servenv/metrics_cgroup_test.go +++ b/go/vt/servenv/metrics_cgroup_test.go @@ -21,6 +21,8 @@ package servenv import ( "testing" + + "github.com/stretchr/testify/require" ) func TestGetCGroupCpuUsageMetrics(t *testing.T) { @@ -35,3 +37,25 @@ func TestGetCgroupMemoryUsageMetrics(t *testing.T) { validateMem(t, mem, err) t.Logf("mem %.5f", mem) } + +func TestErrHandlingWithCgroups(t *testing.T) { + origCgroupManager := cgroupManager + defer func() { + cgroupManager = origCgroupManager + }() + + cpu, err := getCgroupCpuUsage() + validateCpu(t, cpu, err) + mem, err := getCgroupMemoryUsage() + validateMem(t, mem, err) + + cgroupManager = nil + require.Nil(t, cgroupManager) + + cpu, err = getCgroupCpuUsage() + require.ErrorContains(t, err, errCgroupMetricsNotAvailable.Error()) + require.Equal(t, int(cpu), -1) + mem, err = getCgroupMemoryUsage() + require.ErrorContains(t, err, errCgroupMetricsNotAvailable.Error()) + require.Equal(t, int(mem), -1) +} From 89ca16caa6aa31d1d1d293a295824ade85e12ce9 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 24 Oct 2025 17:31:53 +0000 Subject: [PATCH 7/7] Wrap errors Signed-off-by: Matt Lord --- go/vt/servenv/metrics_cgroup.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index 0cb07133630..00e461625bc 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -61,11 +61,11 @@ func setup() { func getCgroupManager() (*cgroup2.Manager, error) { path, err := cgroup2.NestedGroupPath("") if err != nil { - return nil, fmt.Errorf("failed to build nested cgroup paths: %v", err) + return nil, fmt.Errorf("failed to build nested cgroup paths: %w", err) } cgroupManager, err := cgroup2.Load(path) if err != nil { - return nil, fmt.Errorf("failed to load cgroup manager: %v", err) + return nil, fmt.Errorf("failed to load cgroup manager: %w", err) } return cgroupManager, nil } @@ -79,7 +79,7 @@ func getCgroupCpuUsage() (float64, error) { currentTime := time.Now() currentUsage, err = getCurrentCgroupCpuUsage() if err != nil { - return -1, fmt.Errorf("failed to read current cgroup CPU usage: %v", err) + return -1, fmt.Errorf("failed to read current cgroup CPU usage: %w", err) } duration := currentTime.Sub(lastTime) usage, err := getCpuUsageFromSamples(lastCpu, currentUsage, duration) @@ -97,7 +97,7 @@ func getCurrentCgroupCpuUsage() (uint64, error) { } stat1, err := cgroupManager.Stat() if err != nil { - return 0, fmt.Errorf("failed to get initial cgroup CPU stats: %v", err) + return 0, fmt.Errorf("failed to get initial cgroup CPU stats: %w", err) } currentUsage := stat1.CPU.UsageUsec return currentUsage, nil @@ -141,7 +141,7 @@ func computeMemoryUsage(usage uint64, limit uint64) (float64, error) { if limit == math.MaxUint64 { vmem, err := mem.VirtualMemory() if err != nil { - return -1, fmt.Errorf("failed to get virtual memory stats: %v", err) + return -1, fmt.Errorf("failed to get virtual memory stats: %w", err) } limit = vmem.Total }