From 5cfb4c0238c7c078c06bf0f117b9106547551362 Mon Sep 17 00:00:00 2001 From: twthorn Date: Tue, 20 May 2025 12:34:56 -0700 Subject: [PATCH 01/16] Add support for sending grpc server backend metrics via ORCA Signed-off-by: twthorn --- go.mod | 8 ++++++ go.sum | 21 ++++++++++++++++ go/vt/servenv/grpc_server.go | 41 +++++++++++++++++++++++++++++++ go/vt/servenv/grpc_server_test.go | 17 +++++++++++++ 4 files changed, 87 insertions(+) diff --git a/go.mod b/go.mod index 5d4de9cf212..d1aa450edda 100644 --- a/go.mod +++ b/go.mod @@ -161,6 +161,7 @@ require ( github.com/go-jose/go-jose/v4 v4.1.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect + github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-viper/mapstructure/v2 v2.2.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/google/s2a-go v0.1.9 // indirect @@ -177,6 +178,7 @@ require ( github.com/hashicorp/go-sockaddr v1.0.7 // indirect github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect + github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-ieproxy v0.0.12 // indirect github.com/mattn/go-isatty v0.0.20 // indirect @@ -189,6 +191,7 @@ require ( github.com/outcaste-io/ristretto v0.2.3 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect @@ -198,12 +201,17 @@ require ( github.com/ryanuber/go-glob v1.0.0 // indirect github.com/sagikazarmark/locafero v0.9.0 // indirect github.com/secure-systems-lab/go-securesystemslib v0.9.0 // indirect + github.com/shirou/gopsutil/v3 v3.24.5 // indirect + github.com/shirou/gopsutil/v4 v4.25.4 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/cast v1.7.1 // indirect github.com/spiffe/go-spiffe/v2 v2.5.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/tidwall/match v1.1.1 // indirect github.com/tidwall/pretty v1.2.1 // indirect + github.com/tklauser/go-sysconf v0.3.12 // indirect + github.com/tklauser/numcpus v0.6.1 // indirect + github.com/yusufpapurcu/wmi v1.2.4 // indirect github.com/zeebo/errs v1.4.0 // indirect go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.35.0 // indirect diff --git a/go.sum b/go.sum index 47a9f378079..fb122e7ce51 100644 --- a/go.sum +++ b/go.sum @@ -212,6 +212,8 @@ github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= +github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= @@ -250,6 +252,7 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= @@ -359,6 +362,8 @@ github.com/krishicks/yaml-patch v0.0.10 h1:H4FcHpnNwVmw8u0MjPRjWyIXtco6zM2F78t+5 github.com/krishicks/yaml-patch v0.0.10/go.mod h1:Sm5TchwZS6sm7RJoyg87tzxm2ZcKzdRE4Q7TjNhPrME= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 h1:6E+4a0GO5zZEnZ81pIr0yLvtUWk2if982qA3F3QD6H4= +github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I= github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= @@ -441,6 +446,8 @@ github.com/planetscale/vtprotobuf v0.6.1-0.20241121165744-79df5c4772f2/go.mod h1 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c h1:ncq/mPwQF4JjgDlrVEn3C11VoGHZN7m8qihwgMEtzYw= +github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.4.0/go.mod h1:e9GMxYsXl05ICDXkRhurwBS4Q3OK1iX/F2sw+iXX5zU= @@ -487,6 +494,10 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUt github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/secure-systems-lab/go-securesystemslib v0.9.0 h1:rf1HIbL64nUpEIZnjLZ3mcNEL9NBPB0iuVjyxvq3LZc= github.com/secure-systems-lab/go-securesystemslib v0.9.0/go.mod h1:DVHKMcZ+V4/woA/peqr+L0joiRXbPpQ042GgJckkFgw= +github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI= +github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk= +github.com/shirou/gopsutil/v4 v4.25.4 h1:cdtFO363VEOOFrUCjZRh4XVJkb548lyF0q0uTeMqYPw= +github.com/shirou/gopsutil/v4 v4.25.4/go.mod h1:xbuxyoZj+UsgnZrENu3lQivsngRR5BdjbJwf2fv4szA= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= @@ -545,6 +556,10 @@ github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tinylib/msgp v1.2.5 h1:WeQg1whrXRFiZusidTQqzETkRpGjFjcIhW6uqWH09po= github.com/tinylib/msgp v1.2.5/go.mod h1:ykjzy2wzgrlvpDCRc4LA8UXy6D8bzMSuAF3WD57Gok0= +github.com/tklauser/go-sysconf v0.3.12 h1:0QaGUFOdQaIVdPgfITYzaTegZvdCjmYO52cSFAEVmqU= +github.com/tklauser/go-sysconf v0.3.12/go.mod h1:Ho14jnntGE1fpdOqQEEaiKRpvIavV0hSfmBq8nJbHYI= +github.com/tklauser/numcpus v0.6.1 h1:ng9scYS7az0Bk4OZLvrNXNSAO2Pxr1XXRAPyjhIx+Fk= +github.com/tklauser/numcpus v0.6.1/go.mod h1:1XfjsgE2zo8GVw7POkMbHENHzVg3GzmoZ9fESEdAacY= github.com/tv42/httpunix v0.0.0-20150427012821-b75d8614f926/go.mod h1:9ESjWnEqriFuLhtthL60Sar/7RFoluCcXsuvEwTV5KM= github.com/uber/jaeger-client-go v2.30.0+incompatible h1:D6wyKGCecFaSRUpo8lCVbaOOb6ThwMmTEbhRwtKR97o= github.com/uber/jaeger-client-go v2.30.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= @@ -557,6 +572,8 @@ github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82/go.mod h1:lgjkn3NuSvDf github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= +github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= +github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= github.com/z-division/go-zookeeper v1.0.0 h1:ULsCj0nP6+U1liDFWe+2oEF6o4amixoDcDlwEUghVUY= github.com/z-division/go-zookeeper v1.0.0/go.mod h1:6X4UioQXpvyezJJl4J9NHAJKsoffCwy5wCaaTktXjOA= github.com/zeebo/errs v1.4.0 h1:XNdoD/RRMKP7HD0UhJnIzUy74ISdGGxURlYG8HSWSfM= @@ -667,6 +684,7 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191112214154-59a1497f0cea/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -680,6 +698,7 @@ golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210330210617-4fbd30eecc44/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -692,6 +711,8 @@ golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220627191245-f75cf1eec38b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/go/vt/servenv/grpc_server.go b/go/vt/servenv/grpc_server.go index 8d30ee6d253..9124c945d5b 100644 --- a/go/vt/servenv/grpc_server.go +++ b/go/vt/servenv/grpc_server.go @@ -26,12 +26,15 @@ import ( grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" "github.com/spf13/pflag" "google.golang.org/grpc" "google.golang.org/grpc/credentials" "google.golang.org/grpc/health" healthpb "google.golang.org/grpc/health/grpc_health_v1" "google.golang.org/grpc/keepalive" + "google.golang.org/grpc/orca" "google.golang.org/grpc/reflection" "vitess.io/vitess/go/trace" @@ -100,6 +103,8 @@ var ( // there are no active streams, server will send GOAWAY and close the connection. gRPCKeepAliveEnforcementPolicyPermitWithoutStream bool + orcaRecorder orca.ServerMetricsRecorder + gRPCKeepaliveTime = 10 * time.Second gRPCKeepaliveTimeout = 10 * time.Second ) @@ -224,6 +229,9 @@ func createGRPCServer() { opts = append(opts, grpc.MaxRecvMsgSize(msgSize)) opts = append(opts, grpc.MaxSendMsgSize(msgSize)) + opts = append(opts, orca.CallMetricsServerOption(nil)) + orcaRecorder = orca.NewServerMetricsRecorder() + if gRPCInitialConnWindowSize != 0 { log.Infof("Setting grpc server initial conn window size to %d", int32(gRPCInitialConnWindowSize)) opts = append(opts, grpc.InitialConnWindowSize(int32(gRPCInitialConnWindowSize))) @@ -287,6 +295,23 @@ func serveGRPC() { return } + if err := orca.Register(GRPCServer, orca.ServiceOptions{ + // The minimum interval of orca is 30 seconds, unless we enable a testing flag. + MinReportingInterval: 30 * time.Second, + ServerMetricsProvider: orcaRecorder, + }); err != nil { + log.Exitf("Failed to register ORCA service: %v", err) + } + + go func() { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + for range ticker.C { + orcaRecorder.SetCPUUtilization(getCPUUsage()) + orcaRecorder.SetMemoryUtilization(getMemoryUsage()) + } + }() + // register reflection to support list calls :) reflection.Register(GRPCServer) @@ -325,6 +350,22 @@ func serveGRPC() { }) } +func getCPUUsage() float64 { + percentages, err := cpu.Percent(0, false) + if err != nil || len(percentages) == 0 { + return 0 + } + return percentages[0] +} + +func getMemoryUsage() float64 { + vmStat, err := mem.VirtualMemory() + if err != nil { + return 0 + } + return vmStat.UsedPercent +} + // GRPCCheckServiceMap returns if we should register a gRPC service // (and also logs how to enable / disable it) func GRPCCheckServiceMap(name string) bool { diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index 56387574276..324d5d6537e 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -22,6 +22,7 @@ import ( "context" "google.golang.org/grpc" + "google.golang.org/grpc/orca" ) func TestEmpty(t *testing.T) { @@ -61,6 +62,22 @@ func TestDoubleInterceptor(t *testing.T) { } } +func TestOrcaMetricsRecorder(t *testing.T) { + recorder := orca.NewServerMetricsRecorder() + + recorder.SetCPUUtilization(0.25) + recorder.SetMemoryUtilization(0.5) + + snap := recorder.ServerMetrics() + + if snap.CPUUtilization != 0.25 { + t.Errorf("expected cpu 0.25, got %v", snap.CPUUtilization) + } + if snap.MemUtilization != 0.5 { + t.Errorf("expected memory 0.5, got %v", snap.MemUtilization) + } +} + type FakeInterceptor struct { name string streamSeen any From 1b8deae643f30a1f56c5690e02a690ad6f28f104 Mon Sep 17 00:00:00 2001 From: twthorn Date: Wed, 21 May 2025 14:11:35 -0700 Subject: [PATCH 02/16] Rerun go mod tidy Signed-off-by: twthorn --- go.mod | 3 +-- go.sum | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/go.mod b/go.mod index d1aa450edda..0da3826ab96 100644 --- a/go.mod +++ b/go.mod @@ -103,6 +103,7 @@ require ( github.com/kr/text v0.2.0 github.com/mitchellh/mapstructure v1.5.1-0.20231216201459-8508981c8b6c github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249 + github.com/shirou/gopsutil/v4 v4.25.4 github.com/spf13/afero v1.14.0 github.com/spf13/jwalterweatherman v1.1.0 github.com/xlab/treeprint v1.2.0 @@ -201,8 +202,6 @@ require ( github.com/ryanuber/go-glob v1.0.0 // indirect github.com/sagikazarmark/locafero v0.9.0 // indirect github.com/secure-systems-lab/go-securesystemslib v0.9.0 // indirect - github.com/shirou/gopsutil/v3 v3.24.5 // indirect - github.com/shirou/gopsutil/v4 v4.25.4 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/cast v1.7.1 // indirect github.com/spiffe/go-spiffe/v2 v2.5.0 // indirect diff --git a/go.sum b/go.sum index fb122e7ce51..622e3bd07b0 100644 --- a/go.sum +++ b/go.sum @@ -494,8 +494,6 @@ github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUt github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/secure-systems-lab/go-securesystemslib v0.9.0 h1:rf1HIbL64nUpEIZnjLZ3mcNEL9NBPB0iuVjyxvq3LZc= github.com/secure-systems-lab/go-securesystemslib v0.9.0/go.mod h1:DVHKMcZ+V4/woA/peqr+L0joiRXbPpQ042GgJckkFgw= -github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI= -github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk= github.com/shirou/gopsutil/v4 v4.25.4 h1:cdtFO363VEOOFrUCjZRh4XVJkb548lyF0q0uTeMqYPw= github.com/shirou/gopsutil/v4 v4.25.4/go.mod h1:xbuxyoZj+UsgnZrENu3lQivsngRR5BdjbJwf2fv4szA= github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= From ff6c1bc8b81dbd24a0d9d2a16c9328e6e9a1f29c Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 22 May 2025 10:10:13 -0700 Subject: [PATCH 03/16] Add flag for orca metrics Signed-off-by: twthorn --- go/vt/servenv/grpc_server.go | 54 ++++++++++++++++++++----------- go/vt/servenv/grpc_server_test.go | 53 ++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 18 deletions(-) diff --git a/go/vt/servenv/grpc_server.go b/go/vt/servenv/grpc_server.go index 9124c945d5b..aebb340329b 100644 --- a/go/vt/servenv/grpc_server.go +++ b/go/vt/servenv/grpc_server.go @@ -67,6 +67,9 @@ var ( // GRPCServer is the global server to serve gRPC. GRPCServer *grpc.Server + // GRPC server metrics recorder + GRPCServerMetricsRecorder orca.ServerMetricsRecorder + authPlugin Authenticator ) @@ -103,12 +106,18 @@ var ( // there are no active streams, server will send GOAWAY and close the connection. gRPCKeepAliveEnforcementPolicyPermitWithoutStream bool - orcaRecorder orca.ServerMetricsRecorder + // Enable ORCA metrics to be sent from the server to the client to be used for load balancing + gRPCEnableOrcaMetrics bool gRPCKeepaliveTime = 10 * time.Second gRPCKeepaliveTimeout = 10 * time.Second ) +// Injectable behavior for testing +var ( + registerORCA = orca.Register +) + // TLS variables. var ( // gRPCCert is the cert to use if TLS is enabled. @@ -142,6 +151,7 @@ func RegisterGRPCServerFlags() { fs.IntVar(&gRPCInitialWindowSize, "grpc_server_initial_window_size", gRPCInitialWindowSize, "gRPC server initial window size") fs.DurationVar(&gRPCKeepAliveEnforcementPolicyMinTime, "grpc_server_keepalive_enforcement_policy_min_time", gRPCKeepAliveEnforcementPolicyMinTime, "gRPC server minimum keepalive time") fs.BoolVar(&gRPCKeepAliveEnforcementPolicyPermitWithoutStream, "grpc_server_keepalive_enforcement_policy_permit_without_stream", gRPCKeepAliveEnforcementPolicyPermitWithoutStream, "gRPC server permit client keepalive pings even when there are no active streams (RPCs)") + fs.BoolVar(&gRPCEnableOrcaMetrics, "grpc_enable_orca_metrics", gRPCEnableOrcaMetrics, "gRPC server option to enable sending ORCA metrics to clients for load balancing") fs.StringVar(&gRPCCert, "grpc_cert", gRPCCert, "server certificate to use for gRPC connections, requires grpc_key, enables TLS") fs.StringVar(&gRPCKey, "grpc_key", gRPCKey, "server private key to use for gRPC connections, requires grpc_cert, enables TLS") @@ -229,8 +239,10 @@ func createGRPCServer() { opts = append(opts, grpc.MaxRecvMsgSize(msgSize)) opts = append(opts, grpc.MaxSendMsgSize(msgSize)) - opts = append(opts, orca.CallMetricsServerOption(nil)) - orcaRecorder = orca.NewServerMetricsRecorder() + if gRPCEnableOrcaMetrics { + GRPCServerMetricsRecorder = orca.NewServerMetricsRecorder() + opts = append(opts, orca.CallMetricsServerOption(GRPCServerMetricsRecorder)) + } if gRPCInitialConnWindowSize != 0 { log.Infof("Setting grpc server initial conn window size to %d", int32(gRPCInitialConnWindowSize)) @@ -295,23 +307,10 @@ func serveGRPC() { return } - if err := orca.Register(GRPCServer, orca.ServiceOptions{ - // The minimum interval of orca is 30 seconds, unless we enable a testing flag. - MinReportingInterval: 30 * time.Second, - ServerMetricsProvider: orcaRecorder, - }); err != nil { - log.Exitf("Failed to register ORCA service: %v", err) + if gRPCEnableOrcaMetrics { + registerOrca() } - go func() { - ticker := time.NewTicker(30 * time.Second) - defer ticker.Stop() - for range ticker.C { - orcaRecorder.SetCPUUtilization(getCPUUsage()) - orcaRecorder.SetMemoryUtilization(getMemoryUsage()) - } - }() - // register reflection to support list calls :) reflection.Register(GRPCServer) @@ -350,6 +349,25 @@ func serveGRPC() { }) } +func registerOrca() { + if err := registerORCA(GRPCServer, orca.ServiceOptions{ + // The minimum interval of orca is 30 seconds, unless we enable a testing flag. + MinReportingInterval: 30 * time.Second, + ServerMetricsProvider: GRPCServerMetricsRecorder, + }); err != nil { + log.Exitf("Failed to register ORCA service: %v", err) + } + + go func() { + ticker := time.NewTicker(30 * time.Second) + defer ticker.Stop() + for range ticker.C { + GRPCServerMetricsRecorder.SetCPUUtilization(getCPUUsage()) + GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage()) + } + }() +} + func getCPUUsage() float64 { percentages, err := cpu.Percent(0, false) if err != nil || len(percentages) == 0 { diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index 324d5d6537e..73b6af71eae 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -78,6 +78,59 @@ func TestOrcaMetricsRecorder(t *testing.T) { } } +func TestEnableOrcaMetrics(t *testing.T) { + // Set the port to enable gRPC server + gRPCPort = 1000 + originalFlag := gRPCEnableOrcaMetrics + defer func() { gRPCEnableOrcaMetrics = originalFlag }() + originalRecorder := GRPCServerMetricsRecorder + defer func() { GRPCServerMetricsRecorder = originalRecorder }() + + gRPCEnableOrcaMetrics = true + GRPCServerMetricsRecorder = nil + createGRPCServer() + if GRPCServerMetricsRecorder == nil { + t.Errorf("GRPCServerMetricsRecorder should be initialized when gRPCEnableOrcaMetrics is true") + } + + called := false + registerORCA = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { + called = true + return nil + } + defer func() { registerORCA = orca.Register }() + serveGRPC() + if !called { + t.Errorf("registerORCA should have been called when ORCA metrics are enabled") + } +} + +func TestDisableOrcaMetrics(t *testing.T) { + // Set the port to enable gRPC server + gRPCPort = 1001 + originalFlag := gRPCEnableOrcaMetrics + defer func() { gRPCEnableOrcaMetrics = originalFlag }() + originalRecorder := GRPCServerMetricsRecorder + defer func() { GRPCServerMetricsRecorder = originalRecorder }() + GRPCServerMetricsRecorder = nil + gRPCEnableOrcaMetrics = false + createGRPCServer() + if GRPCServerMetricsRecorder != nil { + t.Errorf("GRPCServerMetricsRecorder should NOT be initialized when gRPCEnableOrcaMetrics is false") + } + + called := false + registerORCA = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { + called = true + return nil + } + defer func() { registerORCA = orca.Register }() + serveGRPC() + if called { + t.Errorf("registerORCA should NOT have been called when ORCA metrics are enabled") + } +} + type FakeInterceptor struct { name string streamSeen any From 45b012eb30f936516e6ad7b8ba481aae5edc9c43 Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 22 May 2025 12:59:56 -0700 Subject: [PATCH 04/16] Read mem/cpu from cgroup, if not possible, fall back to system Signed-off-by: twthorn --- go/vt/servenv/grpc_server.go | 47 +++++++++++++++++++++++++++-- go/vt/servenv/grpc_server_test.go | 49 +++++++++++++++++++++++-------- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/go/vt/servenv/grpc_server.go b/go/vt/servenv/grpc_server.go index aebb340329b..2a06930606d 100644 --- a/go/vt/servenv/grpc_server.go +++ b/go/vt/servenv/grpc_server.go @@ -21,12 +21,14 @@ import ( "crypto/tls" "math" "net" + "runtime" "strconv" "time" grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/docker" "github.com/shirou/gopsutil/v4/mem" "github.com/spf13/pflag" "google.golang.org/grpc" @@ -358,6 +360,10 @@ func registerOrca() { log.Exitf("Failed to register ORCA service: %v", err) } + // Initialize the server metrics values + GRPCServerMetricsRecorder.SetCPUUtilization(getCPUUsage()) + GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage()) + go func() { ticker := time.NewTicker(30 * time.Second) defer ticker.Stop() @@ -369,19 +375,56 @@ func registerOrca() { } func getCPUUsage() float64 { + if isLinux() { + if cpuUsage := getCGroupCpuUsage(); cpuUsage != -1 { + return cpuUsage + } + } + percentages, err := cpu.Percent(0, false) if err != nil || len(percentages) == 0 { return 0 } - return percentages[0] + return percentages[0] / 100.0 } func getMemoryUsage() float64 { + if isLinux() { + if stats, err := docker.CgroupMem("self", "/sys/fs/cgroup"); err == nil && stats != nil { + return float64(stats.MemUsageInBytes) / float64(stats.MemLimitInBytes) + } + } + vmStat, err := mem.VirtualMemory() if err != nil { return 0 } - return vmStat.UsedPercent + return vmStat.UsedPercent / 100.0 +} + +func isLinux() bool { + return runtime.GOOS == "linux" +} + +func getCGroupCpuUsage() float64 { + const interval = 100 * time.Millisecond + + start, err := docker.CgroupCPU("self", "/sys/fs/cgroup") + if err != nil || start == nil { + return -1 + } + + time.Sleep(interval) + + end, err := docker.CgroupCPU("self", "/sys/fs/cgroup") + if err != nil || end == nil { + return -1 + } + + delta := end.Usage - start.Usage // in nanoseconds + numCPU := float64(runtime.NumCPU()) + usage := float64(delta) / (float64(interval.Nanoseconds()) * numCPU) + return usage } // GRPCCheckServiceMap returns if we should register a gRPC service diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index 73b6af71eae..010842c3472 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -80,11 +80,9 @@ func TestOrcaMetricsRecorder(t *testing.T) { func TestEnableOrcaMetrics(t *testing.T) { // Set the port to enable gRPC server - gRPCPort = 1000 - originalFlag := gRPCEnableOrcaMetrics - defer func() { gRPCEnableOrcaMetrics = originalFlag }() - originalRecorder := GRPCServerMetricsRecorder - defer func() { GRPCServerMetricsRecorder = originalRecorder }() + withTempVar(&gRPCPort, 1000) + withTempVar(&gRPCEnableOrcaMetrics, true) + withTempVar(&GRPCServerMetricsRecorder, nil) gRPCEnableOrcaMetrics = true GRPCServerMetricsRecorder = nil @@ -107,13 +105,10 @@ func TestEnableOrcaMetrics(t *testing.T) { func TestDisableOrcaMetrics(t *testing.T) { // Set the port to enable gRPC server - gRPCPort = 1001 - originalFlag := gRPCEnableOrcaMetrics - defer func() { gRPCEnableOrcaMetrics = originalFlag }() - originalRecorder := GRPCServerMetricsRecorder - defer func() { GRPCServerMetricsRecorder = originalRecorder }() - GRPCServerMetricsRecorder = nil - gRPCEnableOrcaMetrics = false + withTempVar(&gRPCPort, 10001) + withTempVar(&gRPCEnableOrcaMetrics, false) + withTempVar(&GRPCServerMetricsRecorder, nil) + createGRPCServer() if GRPCServerMetricsRecorder != nil { t.Errorf("GRPCServerMetricsRecorder should NOT be initialized when gRPCEnableOrcaMetrics is false") @@ -131,6 +126,36 @@ func TestDisableOrcaMetrics(t *testing.T) { } } +func TestReportedOrcaMetrics(t *testing.T) { + // Set the port to enable gRPC server + withTempVar(&gRPCPort, 10003) + withTempVar(&gRPCEnableOrcaMetrics, true) + withTempVar(&GRPCServerMetricsRecorder, nil) + + createGRPCServer() + if GRPCServerMetricsRecorder == nil { + t.Errorf("GRPCServerMetricsRecorder should be initialized when gRPCEnableOrcaMetrics is false") + } + + serveGRPC() + serverMetrics := GRPCServerMetricsRecorder.ServerMetrics() + if cpuUsage := serverMetrics.CPUUtilization; cpuUsage < 0 { + t.Errorf("CPU Utilization is not set %.2f", cpuUsage) + } + + if memUsage := serverMetrics.MemUtilization; memUsage < 0 { + t.Errorf("Mem Utilization is not set %.2f", memUsage) + } +} + +func withTempVar[T any](set *T, temp T) (restore func()) { + original := *set + *set = temp + return func() { + *set = original + } +} + type FakeInterceptor struct { name string streamSeen any From a021590e1a1fce390cdf74b333a0081ece5dd7c4 Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 22 May 2025 14:21:52 -0700 Subject: [PATCH 05/16] Use a free port Signed-off-by: twthorn --- go/vt/servenv/grpc_server_test.go | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index 010842c3472..ffd7ea2b3e4 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -17,6 +17,8 @@ limitations under the License. package servenv import ( + "fmt" + "net" "testing" "context" @@ -80,7 +82,7 @@ func TestOrcaMetricsRecorder(t *testing.T) { func TestEnableOrcaMetrics(t *testing.T) { // Set the port to enable gRPC server - withTempVar(&gRPCPort, 1000) + withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, true) withTempVar(&GRPCServerMetricsRecorder, nil) @@ -105,7 +107,7 @@ func TestEnableOrcaMetrics(t *testing.T) { func TestDisableOrcaMetrics(t *testing.T) { // Set the port to enable gRPC server - withTempVar(&gRPCPort, 10001) + withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, false) withTempVar(&GRPCServerMetricsRecorder, nil) @@ -128,7 +130,7 @@ func TestDisableOrcaMetrics(t *testing.T) { func TestReportedOrcaMetrics(t *testing.T) { // Set the port to enable gRPC server - withTempVar(&gRPCPort, 10003) + withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, true) withTempVar(&GRPCServerMetricsRecorder, nil) @@ -148,6 +150,15 @@ func TestReportedOrcaMetrics(t *testing.T) { } } +func getFreePort() int { + l, err := net.Listen("tcp", ":0") + if err != nil { + panic(fmt.Sprintf("could not get free port: %v", err)) + } + defer l.Close() + return l.Addr().(*net.TCPAddr).Port +} + func withTempVar[T any](set *T, temp T) (restore func()) { original := *set *set = temp From 269625f8d07efe300626d2ee1775f66c6a078f6b Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 22 May 2025 16:27:31 -0700 Subject: [PATCH 06/16] Fix flags test with new orca flag Signed-off-by: twthorn --- go/flags/endtoend/mysqlctld.txt | 1 + go/flags/endtoend/vtcombo.txt | 1 + go/flags/endtoend/vtctld.txt | 1 + go/flags/endtoend/vtgate.txt | 1 + go/flags/endtoend/vtgateclienttest.txt | 1 + go/flags/endtoend/vttablet.txt | 1 + go/flags/endtoend/vttestserver.txt | 1 + 7 files changed, 7 insertions(+) diff --git a/go/flags/endtoend/mysqlctld.txt b/go/flags/endtoend/mysqlctld.txt index 22697762078..ec6d8fe272a 100644 --- a/go/flags/endtoend/mysqlctld.txt +++ b/go/flags/endtoend/mysqlctld.txt @@ -72,6 +72,7 @@ Flags: --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024) --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-initial-conn-window-size int gRPC initial connection window size --grpc-initial-window-size int gRPC initial window size --grpc-keepalive-time duration After a duration of this time, if the client doesn't see any activity, it pings the server to see if the transport is still alive. (default 10s) diff --git a/go/flags/endtoend/vtcombo.txt b/go/flags/endtoend/vtcombo.txt index f814649d963..16c5a611b14 100644 --- a/go/flags/endtoend/vtcombo.txt +++ b/go/flags/endtoend/vtcombo.txt @@ -144,6 +144,7 @@ Flags: --grpc-cert string server certificate to use for gRPC connections, requires grpc-key, enables TLS --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-enable-tracing Enable gRPC tracing. --grpc-key string server private key to use for gRPC connections, requires grpc-cert, enables TLS --grpc-max-connection-age duration Maximum age of a client connection before GoAway is sent. (default 2562047h47m16.854775807s) diff --git a/go/flags/endtoend/vtctld.txt b/go/flags/endtoend/vtctld.txt index f29c740fd0c..c14d0ffca95 100644 --- a/go/flags/endtoend/vtctld.txt +++ b/go/flags/endtoend/vtctld.txt @@ -68,6 +68,7 @@ Flags: --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024) --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-enable-tracing Enable gRPC tracing. --grpc-initial-conn-window-size int gRPC initial connection window size --grpc-initial-window-size int gRPC initial window size diff --git a/go/flags/endtoend/vtgate.txt b/go/flags/endtoend/vtgate.txt index 3c28131831a..99d582cece1 100644 --- a/go/flags/endtoend/vtgate.txt +++ b/go/flags/endtoend/vtgate.txt @@ -78,6 +78,7 @@ Flags: --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024) --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-enable-tracing Enable gRPC tracing. --grpc-initial-conn-window-size int gRPC initial connection window size --grpc-initial-window-size int gRPC initial window size diff --git a/go/flags/endtoend/vtgateclienttest.txt b/go/flags/endtoend/vtgateclienttest.txt index 326406686f8..fccea8eb37d 100644 --- a/go/flags/endtoend/vtgateclienttest.txt +++ b/go/flags/endtoend/vtgateclienttest.txt @@ -25,6 +25,7 @@ Flags: --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024) --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-enable-tracing Enable gRPC tracing. --grpc-initial-conn-window-size int gRPC initial connection window size --grpc-initial-window-size int gRPC initial window size diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index cb0d96a7fcf..ce62282c338 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -173,6 +173,7 @@ Flags: --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024) --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-enable-tracing Enable gRPC tracing. --grpc-initial-conn-window-size int gRPC initial connection window size --grpc-initial-window-size int gRPC initial window size diff --git a/go/flags/endtoend/vttestserver.txt b/go/flags/endtoend/vttestserver.txt index eeb48006e30..6d809994929 100644 --- a/go/flags/endtoend/vttestserver.txt +++ b/go/flags/endtoend/vttestserver.txt @@ -54,6 +54,7 @@ Flags: --grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake --grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024) --grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port + --grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing --grpc-enable-tracing Enable gRPC tracing. --grpc-initial-conn-window-size int gRPC initial connection window size --grpc-initial-window-size int gRPC initial window size From 759b2383cdf2ad4ea528d7978549b4fb8fa9c315 Mon Sep 17 00:00:00 2001 From: twthorn Date: Tue, 27 May 2025 14:47:50 -0700 Subject: [PATCH 07/16] Add periods at end of comments Signed-off-by: twthorn --- go/vt/servenv/grpc_server.go | 6 +++--- go/vt/servenv/grpc_server_test.go | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/vt/servenv/grpc_server.go b/go/vt/servenv/grpc_server.go index 599aceb9f79..b922e21f3fb 100644 --- a/go/vt/servenv/grpc_server.go +++ b/go/vt/servenv/grpc_server.go @@ -109,14 +109,14 @@ var ( // there are no active streams, server will send GOAWAY and close the connection. gRPCKeepAliveEnforcementPolicyPermitWithoutStream bool - // Enable ORCA metrics to be sent from the server to the client to be used for load balancing + // Enable ORCA metrics to be sent from the server to the client to be used for load balancing. gRPCEnableOrcaMetrics bool gRPCKeepaliveTime = 10 * time.Second gRPCKeepaliveTimeout = 10 * time.Second ) -// Injectable behavior for testing +// Injectable behavior for testing. var ( registerORCA = orca.Register ) @@ -361,7 +361,7 @@ func registerOrca() { log.Exitf("Failed to register ORCA service: %v", err) } - // Initialize the server metrics values + // Initialize the server metrics values. GRPCServerMetricsRecorder.SetCPUUtilization(getCPUUsage()) GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage()) diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index ffd7ea2b3e4..c4f5ef0d4d7 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -81,7 +81,7 @@ func TestOrcaMetricsRecorder(t *testing.T) { } func TestEnableOrcaMetrics(t *testing.T) { - // Set the port to enable gRPC server + // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, true) withTempVar(&GRPCServerMetricsRecorder, nil) @@ -106,7 +106,7 @@ func TestEnableOrcaMetrics(t *testing.T) { } func TestDisableOrcaMetrics(t *testing.T) { - // Set the port to enable gRPC server + // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, false) withTempVar(&GRPCServerMetricsRecorder, nil) @@ -129,7 +129,7 @@ func TestDisableOrcaMetrics(t *testing.T) { } func TestReportedOrcaMetrics(t *testing.T) { - // Set the port to enable gRPC server + // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, true) withTempVar(&GRPCServerMetricsRecorder, nil) From 15c27c4324f4f50615ea65e46692056eeb2d8071 Mon Sep 17 00:00:00 2001 From: twthorn Date: Tue, 3 Jun 2025 14:47:26 -0700 Subject: [PATCH 08/16] Refactor to support linux & non-linux platforms, and cgroups v1 or v2 Signed-off-by: twthorn --- go.mod | 12 ++ go.sum | 29 +++++ go/vt/servenv/grpc_server.go | 61 +-------- go/vt/servenv/grpc_server_test.go | 8 +- go/vt/servenv/metrics.go | 46 +++++++ go/vt/servenv/metrics_cgroup.go | 171 +++++++++++++++++++++++++ go/vt/servenv/metrics_host.go | 40 ++++++ go/vt/servenv/metrics_host_test.go | 40 ++++++ go/vt/servenv/metrics_linux.go | 36 ++++++ go/vt/servenv/metrics_linux_test.go | 40 ++++++ go/vt/servenv/metrics_nonlinux.go | 40 ++++++ go/vt/servenv/metrics_nonlinux_test.go | 40 ++++++ go/vt/servenv/metrics_test.go | 37 ++++++ 13 files changed, 539 insertions(+), 61 deletions(-) create mode 100644 go/vt/servenv/metrics.go create mode 100644 go/vt/servenv/metrics_cgroup.go create mode 100644 go/vt/servenv/metrics_host.go create mode 100644 go/vt/servenv/metrics_host_test.go create mode 100644 go/vt/servenv/metrics_linux.go create mode 100644 go/vt/servenv/metrics_linux_test.go create mode 100644 go/vt/servenv/metrics_nonlinux.go create mode 100644 go/vt/servenv/metrics_nonlinux_test.go create mode 100644 go/vt/servenv/metrics_test.go diff --git a/go.mod b/go.mod index 82c5794b613..8b938b83314 100644 --- a/go.mod +++ b/go.mod @@ -114,6 +114,16 @@ require ( modernc.org/sqlite v1.37.0 ) +require ( + github.com/cilium/ebpf v0.16.0 // indirect + github.com/containerd/log v0.1.0 // indirect + github.com/docker/go-units v0.5.0 // indirect + github.com/godbus/dbus/v5 v5.1.0 // indirect + github.com/moby/sys/userns v0.1.0 // indirect + github.com/opencontainers/runtime-spec v1.2.0 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect +) + require ( cel.dev/expr v0.24.0 // indirect cloud.google.com/go v0.121.1 // indirect @@ -149,6 +159,8 @@ require ( github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect + github.com/containerd/cgroups v1.1.0 + github.com/containerd/cgroups/v3 v3.0.5 github.com/coreos/go-semver v0.3.1 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect diff --git a/go.sum b/go.sum index 2d252f7e037..bdcde75d2d0 100644 --- a/go.sum +++ b/go.sum @@ -137,6 +137,8 @@ github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cilium/ebpf v0.16.0 h1:+BiEnHL6Z7lXnlGUsXQPPAE7+kenAd4ES8MQ5min0Ok= +github.com/cilium/ebpf v0.16.0/go.mod h1:L7u2Blt2jMM/vLAVgjxluxtBKlz3/GWjB0dMOEngfwE= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= @@ -144,6 +146,12 @@ github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGX github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 h1:aQ3y1lwWyqYPiWZThqv1aFbZMiM9vblcSArJRf2Irls= github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8= github.com/codegangsta/cli v1.20.0/go.mod h1:/qJNoX69yVSKu5o4jLyXAENLRyk1uhi7zkbQ3slBdOA= +github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM= +github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw= +github.com/containerd/cgroups/v3 v3.0.5 h1:44na7Ud+VwyE7LIoJ8JTNQOa549a8543BmzaJHo6Bzo= +github.com/containerd/cgroups/v3 v3.0.5/go.mod h1:SA5DLYnXO8pTGYiAHXz94qvLQTKfVM5GEVisn4jpins= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= github.com/coreos/go-semver v0.3.1 h1:yi21YpKnrx1gt5R+la8n5WgS0kCrsPp33dmEyHReZr4= github.com/coreos/go-semver v0.3.1/go.mod h1:irMmmIw/7yzSRPWryHsK7EYSg09caPQL03VsM8rvUec= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= @@ -163,6 +171,8 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8Yc github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13 h1:fAjc9m62+UWV/WAFKLNi6ZS0675eEUC9y3AlwSbQu1Y= github.com/dgryski/go-farm v0.0.0-20200201041132-a6ae2369ad13/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= +github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4= +github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= @@ -214,6 +224,8 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= +github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= +github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= github.com/go-sql-driver/mysql v1.7.1 h1:lUIinVbN1DY0xBg0eMOzmmtGoHwWBbvnWubQUrtU8EI= github.com/go-sql-driver/mysql v1.7.1/go.mod h1:OXbVy3sEdcQ2Doequ6Z5BW6fXNQTmx+9S1MCJN5yJMI= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= @@ -221,6 +233,8 @@ github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg78 github.com/go-viper/mapstructure/v2 v2.2.1 h1:ZAaOCxANMuZx5RCeg0mBdEZk7DZasvvZIxtHqx8aGss= github.com/go-viper/mapstructure/v2 v2.2.1/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= +github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= @@ -333,7 +347,11 @@ github.com/icrowley/fake v0.0.0-20180203215853-4178557ae428 h1:Mo9W14pwbO9VfRe+y github.com/icrowley/fake v0.0.0-20180203215853-4178557ae428/go.mod h1:uhpZMVGznybq1itEKXj6RYw9I71qK4kH+OGMjRC4KEo= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= +github.com/josharian/native v1.1.0 h1:uuaP0hAbW7Y4l0ZRQ6C9zfb7Mg1mbFKry/xzDAfmtLA= +github.com/josharian/native v1.1.0/go.mod h1:7X/raswPFr05uY3HiLlYeyQntB6OO7E/d2Cu7qoaN2w= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= +github.com/jsimonetti/rtnetlink/v2 v2.0.1 h1:xda7qaHDSVOsADNouv7ukSuicKZO7GgVUCXxpaIEIlM= +github.com/jsimonetti/rtnetlink/v2 v2.0.1/go.mod h1:7MoNYNbb3UaDHtF8udiJo/RH6VsTKP1pqKLUTVCvToE= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= @@ -379,6 +397,10 @@ github.com/mattn/go-runewidth v0.0.9/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= +github.com/mdlayher/netlink v1.7.2 h1:/UtM3ofJap7Vl4QWCPDGXY8d3GIY2UGSDbK+QWmY8/g= +github.com/mdlayher/netlink v1.7.2/go.mod h1:xraEF7uJbxLhc5fpHL4cPe221LI2bdttWlU+ZGLfQSw= +github.com/mdlayher/socket v0.4.1 h1:eM9y2/jlbs1M615oshPQOHZzj6R6wMT7bX5NPiQvn2U= +github.com/mdlayher/socket v0.4.1/go.mod h1:cAqeGjoufqdxWkD7DkpyS+wcefOtmu5OQ8KuoJGIReA= github.com/miekg/dns v1.1.56 h1:5imZaSeoRNvpM9SzWNhEcP9QliKiz20/dA2QabIGVnE= github.com/miekg/dns v1.1.56/go.mod h1:cRm6Oo2C8TY9ZS/TqsSrseAcncm74lfK5G+ikN2SWWY= github.com/minio/minio-go v0.0.0-20190131015406-c8a261de75c1 h1:jw16EimP5oAEM/2wt+SiEUov/YDyTCTDuPtIKgQIvk0= @@ -387,6 +409,8 @@ github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/mapstructure v1.5.1-0.20231216201459-8508981c8b6c h1:cqn374mizHuIWj+OSJCajGr/phAmuMug9qIX3l9CflE= github.com/mitchellh/mapstructure v1.5.1-0.20231216201459-8508981c8b6c/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g= +github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= @@ -415,6 +439,8 @@ github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7J github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.23.0 h1:/oxKu9c2HVap+F3PfKort2Hw5DEU+HGlW8n+tguWsys= github.com/onsi/gomega v1.23.0/go.mod h1:Z/NWtiqwBrwUt4/2loMmHL63EDLnYHmVbuBpDr2vQAg= +github.com/opencontainers/runtime-spec v1.2.0 h1:z97+pHb3uELt/yiAWD691HNHQIF07bE7dzrbT927iTk= +github.com/opencontainers/runtime-spec v1.2.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opentracing-contrib/go-grpc v0.1.2 h1:MP16Ozc59kqqwn1v18aQxpeGZhsBanJ2iurZYaQSZ+g= github.com/opentracing-contrib/go-grpc v0.1.2/go.mod h1:glU6rl1Fhfp9aXUHkE36K2mR4ht8vih0ekOVlWKEUHM= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= @@ -500,6 +526,8 @@ github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sjmudd/stopwatch v0.1.1 h1:x45OvxFB5OtCkjvYtzRF5fWB857Jzjjk84Oyd5C5ebw= github.com/sjmudd/stopwatch v0.1.1/go.mod h1:BLw0oIQJ1YLXBO/q9ufK/SgnKBVIkC2qrm6uy78Zw6U= github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= @@ -708,6 +736,7 @@ golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220627191245-f75cf1eec38b/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/go/vt/servenv/grpc_server.go b/go/vt/servenv/grpc_server.go index b922e21f3fb..604daa88e3d 100644 --- a/go/vt/servenv/grpc_server.go +++ b/go/vt/servenv/grpc_server.go @@ -21,15 +21,11 @@ import ( "crypto/tls" "math" "net" - "runtime" "strconv" "time" grpc_middleware "github.com/grpc-ecosystem/go-grpc-middleware" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" - "github.com/shirou/gopsutil/v4/cpu" - "github.com/shirou/gopsutil/v4/docker" - "github.com/shirou/gopsutil/v4/mem" "github.com/spf13/pflag" "google.golang.org/grpc" "google.golang.org/grpc/credentials" @@ -362,72 +358,19 @@ func registerOrca() { } // Initialize the server metrics values. - GRPCServerMetricsRecorder.SetCPUUtilization(getCPUUsage()) + GRPCServerMetricsRecorder.SetCPUUtilization(getCpuUsage()) GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage()) go func() { ticker := time.NewTicker(30 * time.Second) defer ticker.Stop() for range ticker.C { - GRPCServerMetricsRecorder.SetCPUUtilization(getCPUUsage()) + GRPCServerMetricsRecorder.SetCPUUtilization(getCpuUsage()) GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage()) } }() } -func getCPUUsage() float64 { - if isLinux() { - if cpuUsage := getCGroupCpuUsage(); cpuUsage != -1 { - return cpuUsage - } - } - - percentages, err := cpu.Percent(0, false) - if err != nil || len(percentages) == 0 { - return 0 - } - return percentages[0] / 100.0 -} - -func getMemoryUsage() float64 { - if isLinux() { - if stats, err := docker.CgroupMem("self", "/sys/fs/cgroup"); err == nil && stats != nil { - return float64(stats.MemUsageInBytes) / float64(stats.MemLimitInBytes) - } - } - - vmStat, err := mem.VirtualMemory() - if err != nil { - return 0 - } - return vmStat.UsedPercent / 100.0 -} - -func isLinux() bool { - return runtime.GOOS == "linux" -} - -func getCGroupCpuUsage() float64 { - const interval = 100 * time.Millisecond - - start, err := docker.CgroupCPU("self", "/sys/fs/cgroup") - if err != nil || start == nil { - return -1 - } - - time.Sleep(interval) - - end, err := docker.CgroupCPU("self", "/sys/fs/cgroup") - if err != nil || end == nil { - return -1 - } - - delta := end.Usage - start.Usage // in nanoseconds - numCPU := float64(runtime.NumCPU()) - usage := float64(delta) / (float64(interval.Nanoseconds()) * numCPU) - return usage -} - // GRPCCheckServiceMap returns if we should register a gRPC service // (and also logs how to enable / disable it) func GRPCCheckServiceMap(name string) bool { diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index c4f5ef0d4d7..31a4cdbba85 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -141,13 +141,17 @@ func TestReportedOrcaMetrics(t *testing.T) { serveGRPC() serverMetrics := GRPCServerMetricsRecorder.ServerMetrics() - if cpuUsage := serverMetrics.CPUUtilization; cpuUsage < 0 { + cpuUsage := serverMetrics.CPUUtilization + if cpuUsage < 0 { t.Errorf("CPU Utilization is not set %.2f", cpuUsage) } + t.Logf("CPU Utilization is %.2f", cpuUsage) - if memUsage := serverMetrics.MemUtilization; memUsage < 0 { + memUsage := serverMetrics.MemUtilization + if memUsage < 0 { t.Errorf("Mem Utilization is not set %.2f", memUsage) } + t.Logf("Memory utilization is %.2f", memUsage) } func getFreePort() int { diff --git a/go/vt/servenv/metrics.go b/go/vt/servenv/metrics.go new file mode 100644 index 00000000000..b11681107f3 --- /dev/null +++ b/go/vt/servenv/metrics.go @@ -0,0 +1,46 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import "time" + +var cpuSampleInterval time.Duration = 2 * time.Second + +func getCpuUsage() float64 { + cpuSampleInterval = 1 * time.Second + value, err := getCgroupCpu() + if err == nil { + return value + } + value, err = getHostCpu() + if err == nil { + return value + } + return -1 +} + +func getMemoryUsage() float64 { + value, err := getCgroupMemory() + if err == nil { + return value + } + value, err = getHostMemory() + if err == nil { + return value + } + return -1 +} diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go new file mode 100644 index 00000000000..05cbe7f507a --- /dev/null +++ b/go/vt/servenv/metrics_cgroup.go @@ -0,0 +1,171 @@ +//go:build linux +// +build linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "fmt" + "log" + "runtime" + "time" + + "github.com/containerd/cgroups" + "github.com/containerd/cgroups/v3/cgroup1" + "github.com/containerd/cgroups/v3/cgroup2" +) + +var ( + cgroup2Manager *cgroup2.Manager + cgroup1Manager cgroup1.Cgroup +) + +func init() { + if cgroups.Mode() == cgroups.Unified { + manager, err := getCGroup2() + if err != nil { + log.Printf("Failed to load cgroup2 manager: %v", err) + return + } + cgroup2Manager = manager + } else { + cgroup, err := getCGroup1() + if err != nil { + log.Printf("Failed to load cgroup1 manager: %v", err) + return + } + cgroup1Manager = cgroup + } +} + +func isCgroupV2() bool { + return cgroups.Mode() == cgroups.Unified +} + +func getCGroup1() (cgroup1.Cgroup, error) { + path := cgroup1.NestedPath("") + cgroup, err := cgroup1.Load(path) + if err != nil { + return nil, fmt.Errorf("cgroup1 manager is nil") + } + return cgroup, nil +} + +func getCGroup2() (*cgroup2.Manager, error) { + path, err := cgroup2.NestedGroupPath("") + if err != nil { + return nil, fmt.Errorf("failed to load cgroup2 manager: %w", err) + } + cgroupManager, err := cgroup2.Load(path) + if err != nil { + return nil, fmt.Errorf("cgroup2 manager is nil") + } + return cgroupManager, nil +} + +func getCgroupCpuUsage(interval time.Duration) (float64, error) { + if isCgroupV2() { + return getCgroup2CpuUsage(interval) + } else { + return getCgroup1CpuUsage(interval) + } +} + +func getCgroupMemoryUsage() (float64, error) { + if isCgroupV2() { + return getCgroup2MemoryUsage() + } else { + return getCgroup1MemoryUsage() + } +} + +func getCgroup1CpuUsage(interval time.Duration) (float64, error) { + stat1, err := cgroup1Manager.Stat() + if err != nil { + return -1, fmt.Errorf("failed to get initial CPU stat: %w", err) + } + usage1 := stat1.CPU.Usage.Total + + time.Sleep(interval) + + stat2, err := cgroup1Manager.Stat() + if err != nil { + return -1, fmt.Errorf("failed to get second CPU stat: %w", err) + } + usage2 := stat2.CPU.Usage.Total + + return getCpuUsageFromSamples(usage1, usage2, interval) +} + +func getCpuUsageFromSamples(usage1 uint64, usage2 uint64, interval time.Duration) (float64, error) { + if usage1 == 0 && usage2 == 0 { + return -1, fmt.Errorf("CPU usage for both samples is zero") + } + + deltaUsage := usage2 - usage1 + deltaTime := float64(interval.Microseconds()) + + cpuCount := float64(runtime.NumCPU()) + cpuUsage := (float64(deltaUsage) / deltaTime) / cpuCount + + return cpuUsage, nil +} + +func getCgroup1MemoryUsage() (float64, error) { + stats, err := cgroup1Manager.Stat() + if err != nil { + return -1, fmt.Errorf("failed to get cgroup2 stats: %w", err) + } + usage := stats.Memory.Usage.Usage + limit := stats.Memory.Usage.Limit + if limit == 0 || limit == ^uint64(0) { + return -1, fmt.Errorf("Failed to compute memory usage with invalid limit: %d", limit) + } + return float64(usage) / float64(limit), nil +} + +func getCgroup2CpuUsage(interval time.Duration) (float64, error) { + stat1, err := cgroup2Manager.Stat() + if err != nil { + return -1, fmt.Errorf("failed to get initial CPU stat: %w", err) + } + usage1 := stat1.CPU.UsageUsec + + time.Sleep(interval) + + stat2, err := cgroup2Manager.Stat() + if err != nil { + return -1, fmt.Errorf("failed to get second CPU stat: %w", err) + } + usage2 := stat2.CPU.UsageUsec + + return getCpuUsageFromSamples(usage1, usage2, interval) +} + +func getCgroup2MemoryUsage() (float64, error) { + stats, err := cgroup2Manager.Stat() + if err != nil { + return -1, fmt.Errorf("failed to get cgroup2 stats: %w", err) + } + usage := stats.Memory.Usage + limit := stats.Memory.UsageLimit + if limit == 0 || limit == ^uint64(0) { + return -1, fmt.Errorf("Failed to compute memory usage with invalid limit: %d", limit) + } + return float64(usage) / float64(limit), nil +} diff --git a/go/vt/servenv/metrics_host.go b/go/vt/servenv/metrics_host.go new file mode 100644 index 00000000000..56d02c2f2cc --- /dev/null +++ b/go/vt/servenv/metrics_host.go @@ -0,0 +1,40 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "fmt" + + "github.com/shirou/gopsutil/v4/cpu" + "github.com/shirou/gopsutil/v4/mem" +) + +func getHostCpuUsage() (float64, error) { + percentages, err := cpu.Percent(cpuSampleInterval, true) + if err != nil || len(percentages) == 0 { + return -1, fmt.Errorf("Failed to get cpu usage %v", err) + } + return percentages[0] / 100.0, nil +} + +func getHostMemoryUsage() (float64, error) { + vmStat, err := mem.VirtualMemory() + if err != nil { + return -1, fmt.Errorf("Failed to get memory usage %v", err) + } + return vmStat.UsedPercent / 100.0, nil +} diff --git a/go/vt/servenv/metrics_host_test.go b/go/vt/servenv/metrics_host_test.go new file mode 100644 index 00000000000..b6981ad977f --- /dev/null +++ b/go/vt/servenv/metrics_host_test.go @@ -0,0 +1,40 @@ +//go:build !linux +// +build !linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "testing" +) + +func TestReportCpu(t *testing.T) { + cpuUsage, err := getHostCpuUsage() + if err != nil { + t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) + } + t.Logf("CPU Utilization is %.2f", cpuUsage) +} + +func TestReportMemory(t *testing.T) { + memoryUsage, err := getHostMemoryUsage() + if err != nil { + t.Errorf("Error reading memory: %v, value %.2f", err, memoryUsage) + } + t.Logf("Memory Utilization is %.2f", memoryUsage) +} diff --git a/go/vt/servenv/metrics_linux.go b/go/vt/servenv/metrics_linux.go new file mode 100644 index 00000000000..dc56fd5e108 --- /dev/null +++ b/go/vt/servenv/metrics_linux.go @@ -0,0 +1,36 @@ +//go:build linux +// +build linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +func getHostCpu() (float64, error) { + return getHostCpuUsage() +} + +func getHostMemory() (float64, error) { + return getHostMemoryUsage() +} + +func getCgroupCpu() (float64, error) { + return getCgroupCpuUsage(cpuSampleInterval) +} + +func getCgroupMemory() (float64, error) { + return getCgroupMemoryUsage() +} diff --git a/go/vt/servenv/metrics_linux_test.go b/go/vt/servenv/metrics_linux_test.go new file mode 100644 index 00000000000..09198ce5fe8 --- /dev/null +++ b/go/vt/servenv/metrics_linux_test.go @@ -0,0 +1,40 @@ +//go:build linux +// +build linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "testing" +) + +func TestGetCpu(t *testing.T) { + cpuUsage, err := getHostCpuUsage() + if err != nil || cpuUsage < 0 { + t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) + } + t.Logf("CPU Utilization is %.2f", cpuUsage) +} + +func TestGetMemory(t *testing.T) { + memoryUsage, err := getHostMemoryUsage() + if err != nil || memoryUsage < 0 || memoryUsage > 1 { + t.Errorf("Error reading memory: %v, value %.2f", err, memoryUsage) + } + t.Logf("Memory Utilization is %.2f", memoryUsage) +} diff --git a/go/vt/servenv/metrics_nonlinux.go b/go/vt/servenv/metrics_nonlinux.go new file mode 100644 index 00000000000..387b8eaba70 --- /dev/null +++ b/go/vt/servenv/metrics_nonlinux.go @@ -0,0 +1,40 @@ +//go:build !linux +// +build !linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "fmt" +) + +func getHostCpu() (float64, error) { + return getHostCpuUsage() +} + +func getHostMemory() (float64, error) { + return getHostMemoryUsage() +} + +func getCgroupCpu() (float64, error) { + return -1, fmt.Errorf("CGroup not supported on nonlinux platform") +} + +func getCgroupMemory() (float64, error) { + return -1, fmt.Errorf("CGroup not supported on nonlinux platform") +} diff --git a/go/vt/servenv/metrics_nonlinux_test.go b/go/vt/servenv/metrics_nonlinux_test.go new file mode 100644 index 00000000000..508e32ddab1 --- /dev/null +++ b/go/vt/servenv/metrics_nonlinux_test.go @@ -0,0 +1,40 @@ +//go:build !linux +// +build !linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "testing" +) + +func TestGetCpu(t *testing.T) { + cpuUsage, err := getHostCpuUsage() + if err != nil || cpuUsage < 0 { + t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) + } + t.Logf("CPU Utilization is %.2f", cpuUsage) +} + +func TestGetMemory(t *testing.T) { + memoryUsage, err := getHostMemoryUsage() + if err != nil || memoryUsage < 0 || memoryUsage > 1 { + t.Errorf("Error reading memory: %v, value %.2f", err, memoryUsage) + } + t.Logf("Memory Utilization is %.2f", memoryUsage) +} diff --git a/go/vt/servenv/metrics_test.go b/go/vt/servenv/metrics_test.go new file mode 100644 index 00000000000..79dc742b23d --- /dev/null +++ b/go/vt/servenv/metrics_test.go @@ -0,0 +1,37 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "testing" +) + +func TestGetCpuUsage(t *testing.T) { + value := getCpuUsage() + t.Logf("CPU usage %v", value) + if value < 0 || value > 1 { + t.Errorf("Got invalid CPU value %v", value) + } +} + +func TestGetMemoryUsage(t *testing.T) { + value := getMemoryUsage() + t.Logf("Memory usage %v", value) + if value < 0 || value > 1 { + t.Errorf("Got invalid memory value %v", value) + } +} From 2c8dd3afc0d65a9ad7b3a47cc11cca3309890f81 Mon Sep 17 00:00:00 2001 From: twthorn Date: Tue, 3 Jun 2025 15:31:10 -0700 Subject: [PATCH 09/16] Avoid sleep interval, use last sampled CPU Signed-off-by: twthorn --- go/vt/servenv/metrics.go | 5 -- go/vt/servenv/metrics_cgroup.go | 80 ++++++++++++++++------------- go/vt/servenv/metrics_host.go | 2 +- go/vt/servenv/metrics_linux.go | 2 +- go/vt/servenv/metrics_linux_test.go | 2 + 5 files changed, 48 insertions(+), 43 deletions(-) diff --git a/go/vt/servenv/metrics.go b/go/vt/servenv/metrics.go index b11681107f3..79e39b1f45f 100644 --- a/go/vt/servenv/metrics.go +++ b/go/vt/servenv/metrics.go @@ -16,12 +16,7 @@ limitations under the License. package servenv -import "time" - -var cpuSampleInterval time.Duration = 2 * time.Second - func getCpuUsage() float64 { - cpuSampleInterval = 1 * time.Second value, err := getCgroupCpu() if err == nil { return value diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index 05cbe7f507a..2c149b1c536 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -21,36 +21,46 @@ package servenv import ( "fmt" - "log" "runtime" "time" "github.com/containerd/cgroups" "github.com/containerd/cgroups/v3/cgroup1" "github.com/containerd/cgroups/v3/cgroup2" + + "vitess.io/vitess/go/vt/log" ) var ( cgroup2Manager *cgroup2.Manager cgroup1Manager cgroup1.Cgroup + lastCpu uint64 + lastTime time.Time ) func init() { if cgroups.Mode() == cgroups.Unified { manager, err := getCGroup2() if err != nil { - log.Printf("Failed to load cgroup2 manager: %v", err) - return + log.Errorf("Failed to init cgroup2 manager: %v", err) } cgroup2Manager = manager + lastCpu, err = getCgroup2CpuUsage() + if err != nil { + log.Errorf("Failed to init cgroup2 cpu %v", err) + } } else { cgroup, err := getCGroup1() if err != nil { - log.Printf("Failed to load cgroup1 manager: %v", err) - return + log.Errorf("Failed to init cgroup1 manager: %v", err) } cgroup1Manager = cgroup + lastCpu, err = getCgroup2CpuUsage() + if err != nil { + log.Errorf("Failed to init cgroup1 cpu %v", err) + } } + lastTime = time.Now() } func isCgroupV2() bool { @@ -78,12 +88,28 @@ func getCGroup2() (*cgroup2.Manager, error) { return cgroupManager, nil } -func getCgroupCpuUsage(interval time.Duration) (float64, error) { +func getCgroupCpuUsage() (float64, error) { + var ( + currentUsage uint64 + err error + ) + currentTime := time.Now() if isCgroupV2() { - return getCgroup2CpuUsage(interval) + currentUsage, err = getCgroup2CpuUsage() } else { - return getCgroup1CpuUsage(interval) + currentUsage, err = getCgroup1CpuUsage() + } + if err != nil { + return -1, fmt.Errorf("Could not read cpu usage") + } + duration := currentTime.Sub(lastTime) + usage, err := getCpuUsageFromSamples(lastCpu, currentUsage, duration) + if err != nil { + return -1, err } + lastCpu = currentUsage + lastTime = currentTime + return usage, nil } func getCgroupMemoryUsage() (float64, error) { @@ -94,22 +120,22 @@ func getCgroupMemoryUsage() (float64, error) { } } -func getCgroup1CpuUsage(interval time.Duration) (float64, error) { +func getCgroup1CpuUsage() (uint64, error) { stat1, err := cgroup1Manager.Stat() if err != nil { - return -1, fmt.Errorf("failed to get initial CPU stat: %w", err) + return 0, fmt.Errorf("failed to get initial CPU stat: %w", err) } - usage1 := stat1.CPU.Usage.Total - - time.Sleep(interval) + currentUsage := stat1.CPU.Usage.Total + return currentUsage, nil +} - stat2, err := cgroup1Manager.Stat() +func getCgroup2CpuUsage() (uint64, error) { + stat1, err := cgroup2Manager.Stat() if err != nil { - return -1, fmt.Errorf("failed to get second CPU stat: %w", err) + return 0, fmt.Errorf("failed to get initial CPU stat: %w", err) } - usage2 := stat2.CPU.Usage.Total - - return getCpuUsageFromSamples(usage1, usage2, interval) + currentUsage := stat1.CPU.UsageUsec + return currentUsage, nil } func getCpuUsageFromSamples(usage1 uint64, usage2 uint64, interval time.Duration) (float64, error) { @@ -139,24 +165,6 @@ func getCgroup1MemoryUsage() (float64, error) { return float64(usage) / float64(limit), nil } -func getCgroup2CpuUsage(interval time.Duration) (float64, error) { - stat1, err := cgroup2Manager.Stat() - if err != nil { - return -1, fmt.Errorf("failed to get initial CPU stat: %w", err) - } - usage1 := stat1.CPU.UsageUsec - - time.Sleep(interval) - - stat2, err := cgroup2Manager.Stat() - if err != nil { - return -1, fmt.Errorf("failed to get second CPU stat: %w", err) - } - usage2 := stat2.CPU.UsageUsec - - return getCpuUsageFromSamples(usage1, usage2, interval) -} - func getCgroup2MemoryUsage() (float64, error) { stats, err := cgroup2Manager.Stat() if err != nil { diff --git a/go/vt/servenv/metrics_host.go b/go/vt/servenv/metrics_host.go index 56d02c2f2cc..9f4b213711b 100644 --- a/go/vt/servenv/metrics_host.go +++ b/go/vt/servenv/metrics_host.go @@ -24,7 +24,7 @@ import ( ) func getHostCpuUsage() (float64, error) { - percentages, err := cpu.Percent(cpuSampleInterval, true) + percentages, err := cpu.Percent(0, false) if err != nil || len(percentages) == 0 { return -1, fmt.Errorf("Failed to get cpu usage %v", err) } diff --git a/go/vt/servenv/metrics_linux.go b/go/vt/servenv/metrics_linux.go index dc56fd5e108..68dbd24b423 100644 --- a/go/vt/servenv/metrics_linux.go +++ b/go/vt/servenv/metrics_linux.go @@ -28,7 +28,7 @@ func getHostMemory() (float64, error) { } func getCgroupCpu() (float64, error) { - return getCgroupCpuUsage(cpuSampleInterval) + return getCgroupCpuUsage() } func getCgroupMemory() (float64, error) { diff --git a/go/vt/servenv/metrics_linux_test.go b/go/vt/servenv/metrics_linux_test.go index 09198ce5fe8..765c6f55e08 100644 --- a/go/vt/servenv/metrics_linux_test.go +++ b/go/vt/servenv/metrics_linux_test.go @@ -21,9 +21,11 @@ package servenv import ( "testing" + "time" ) func TestGetCpu(t *testing.T) { + time.Sleep(2 * time.Second) cpuUsage, err := getHostCpuUsage() if err != nil || cpuUsage < 0 { t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) From 1385cc8fbe6e7ce6aa6b62ee58d609de809d8968 Mon Sep 17 00:00:00 2001 From: twthorn Date: Wed, 4 Jun 2025 10:59:43 -0700 Subject: [PATCH 10/16] Fix cpu to use c group 1, remove unnecessary build constraint, use consistent naming case Signed-off-by: twthorn --- go/vt/servenv/metrics_cgroup.go | 10 +++++----- go/vt/servenv/metrics_host_test.go | 3 --- go/vt/servenv/metrics_nonlinux.go | 4 ++-- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index 2c149b1c536..3bb1ddf4590 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -40,7 +40,7 @@ var ( func init() { if cgroups.Mode() == cgroups.Unified { - manager, err := getCGroup2() + manager, err := getCgroup2() if err != nil { log.Errorf("Failed to init cgroup2 manager: %v", err) } @@ -50,12 +50,12 @@ func init() { log.Errorf("Failed to init cgroup2 cpu %v", err) } } else { - cgroup, err := getCGroup1() + cgroup, err := getCgroup1() if err != nil { log.Errorf("Failed to init cgroup1 manager: %v", err) } cgroup1Manager = cgroup - lastCpu, err = getCgroup2CpuUsage() + lastCpu, err = getCgroup1CpuUsage() if err != nil { log.Errorf("Failed to init cgroup1 cpu %v", err) } @@ -67,7 +67,7 @@ func isCgroupV2() bool { return cgroups.Mode() == cgroups.Unified } -func getCGroup1() (cgroup1.Cgroup, error) { +func getCgroup1() (cgroup1.Cgroup, error) { path := cgroup1.NestedPath("") cgroup, err := cgroup1.Load(path) if err != nil { @@ -76,7 +76,7 @@ func getCGroup1() (cgroup1.Cgroup, error) { return cgroup, nil } -func getCGroup2() (*cgroup2.Manager, error) { +func getCgroup2() (*cgroup2.Manager, error) { path, err := cgroup2.NestedGroupPath("") if err != nil { return nil, fmt.Errorf("failed to load cgroup2 manager: %w", err) diff --git a/go/vt/servenv/metrics_host_test.go b/go/vt/servenv/metrics_host_test.go index b6981ad977f..447af3329b3 100644 --- a/go/vt/servenv/metrics_host_test.go +++ b/go/vt/servenv/metrics_host_test.go @@ -1,6 +1,3 @@ -//go:build !linux -// +build !linux - /* Copyright 2025 The Vitess Authors. diff --git a/go/vt/servenv/metrics_nonlinux.go b/go/vt/servenv/metrics_nonlinux.go index 387b8eaba70..07a9fbe515d 100644 --- a/go/vt/servenv/metrics_nonlinux.go +++ b/go/vt/servenv/metrics_nonlinux.go @@ -32,9 +32,9 @@ func getHostMemory() (float64, error) { } func getCgroupCpu() (float64, error) { - return -1, fmt.Errorf("CGroup not supported on nonlinux platform") + return -1, fmt.Errorf("Cgroup not supported on nonlinux platform") } func getCgroupMemory() (float64, error) { - return -1, fmt.Errorf("CGroup not supported on nonlinux platform") + return -1, fmt.Errorf("Cgroup not supported on nonlinux platform") } From 06de694ed82fdb505a7725cb3814fdb681558024 Mon Sep 17 00:00:00 2001 From: twthorn Date: Wed, 4 Jun 2025 18:11:47 -0700 Subject: [PATCH 11/16] Add docker support for running cgroups test, refactor tests Signed-off-by: twthorn --- docker/test/Dockerfile.metrics | 17 +++++++ .../metrics/docker_metrics_cgroup_test.go | 49 +++++++++++++++++++ go/vt/servenv/grpc_server_test.go | 8 +-- go/vt/servenv/metrics_cgroup.go | 23 +++++++-- go/vt/servenv/metrics_cgroup_test.go | 37 ++++++++++++++ go/vt/servenv/metrics_host_test.go | 17 +++---- go/vt/servenv/metrics_linux_test.go | 15 ++---- go/vt/servenv/metrics_nonlinux_test.go | 13 ++--- go/vt/servenv/metrics_test.go | 37 +++++++++++--- 9 files changed, 171 insertions(+), 45 deletions(-) create mode 100644 docker/test/Dockerfile.metrics create mode 100644 go/test/endtoend/metrics/docker_metrics_cgroup_test.go create mode 100644 go/vt/servenv/metrics_cgroup_test.go diff --git a/docker/test/Dockerfile.metrics b/docker/test/Dockerfile.metrics new file mode 100644 index 00000000000..c655ec59885 --- /dev/null +++ b/docker/test/Dockerfile.metrics @@ -0,0 +1,17 @@ +# Use the official Go image as a base image +FROM golang:1.24.3 + +# Set the working directory inside the container +WORKDIR /app + +# Copy the go mod and sum files +COPY go.mod go.sum ./ + +# Download all dependencies +RUN go mod download + +# Copy the source from the current directory to the working directory inside the container +COPY . . + +# Command to run the tests +CMD ["go", "test", "-count=1", "-timeout=60s", "-run", "Metrics", "vitess.io/vitess/go/vt/servenv", "-v"] diff --git a/go/test/endtoend/metrics/docker_metrics_cgroup_test.go b/go/test/endtoend/metrics/docker_metrics_cgroup_test.go new file mode 100644 index 00000000000..bb52183b84a --- /dev/null +++ b/go/test/endtoend/metrics/docker_metrics_cgroup_test.go @@ -0,0 +1,49 @@ +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "fmt" + "os" + "os/exec" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestMain(m *testing.M) { + fmt.Println("Building test Docker image...") + build := exec.Command("docker", "build", "-f", "docker/test/Dockerfile.metrics", "-t", "metrics:test", ".") + build.Dir = "../../../.." + build.Stdout = os.Stdout + build.Stderr = os.Stderr + if err := build.Run(); err != nil { + fmt.Fprintf(os.Stderr, "Failed to build Docker image: %v\n", err) + os.Exit(1) + } + + os.Exit(m.Run()) +} + +func TestReportCpuInDocker(t *testing.T) { + cmd := exec.Command("docker", "run", "--rm", "metrics:test") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + err := cmd.Run() + require.NoError(t, err, "Docker test container failed") +} diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index 31a4cdbba85..a93a7a30b3e 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -64,7 +64,7 @@ func TestDoubleInterceptor(t *testing.T) { } } -func TestOrcaMetricsRecorder(t *testing.T) { +func TestOrcaRecorder(t *testing.T) { recorder := orca.NewServerMetricsRecorder() recorder.SetCPUUtilization(0.25) @@ -80,7 +80,7 @@ func TestOrcaMetricsRecorder(t *testing.T) { } } -func TestEnableOrcaMetrics(t *testing.T) { +func TestEnableOrca(t *testing.T) { // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, true) @@ -105,7 +105,7 @@ func TestEnableOrcaMetrics(t *testing.T) { } } -func TestDisableOrcaMetrics(t *testing.T) { +func TestDisableOrca(t *testing.T) { // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, false) @@ -128,7 +128,7 @@ func TestDisableOrcaMetrics(t *testing.T) { } } -func TestReportedOrcaMetrics(t *testing.T) { +func TestReportedOrca(t *testing.T) { // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) withTempVar(&gRPCEnableOrcaMetrics, true) diff --git a/go/vt/servenv/metrics_cgroup.go b/go/vt/servenv/metrics_cgroup.go index 3bb1ddf4590..ae9e8bfd07d 100644 --- a/go/vt/servenv/metrics_cgroup.go +++ b/go/vt/servenv/metrics_cgroup.go @@ -21,12 +21,14 @@ package servenv import ( "fmt" + "math" "runtime" "time" "github.com/containerd/cgroups" "github.com/containerd/cgroups/v3/cgroup1" "github.com/containerd/cgroups/v3/cgroup2" + "github.com/shirou/gopsutil/v4/mem" "vitess.io/vitess/go/vt/log" ) @@ -159,10 +161,7 @@ func getCgroup1MemoryUsage() (float64, error) { } usage := stats.Memory.Usage.Usage limit := stats.Memory.Usage.Limit - if limit == 0 || limit == ^uint64(0) { - return -1, fmt.Errorf("Failed to compute memory usage with invalid limit: %d", limit) - } - return float64(usage) / float64(limit), nil + return computeMemoryUsage(usage, limit) } func getCgroup2MemoryUsage() (float64, error) { @@ -172,8 +171,22 @@ func getCgroup2MemoryUsage() (float64, error) { } usage := stats.Memory.Usage limit := stats.Memory.UsageLimit - if limit == 0 || limit == ^uint64(0) { + return computeMemoryUsage(usage, limit) +} + +func computeMemoryUsage(usage uint64, limit uint64) (float64, error) { + if usage == 0 || usage == math.MaxUint64 { + return -1, fmt.Errorf("Failed to find memory usage with invalid value: %d", usage) + } + if limit == 0 { return -1, fmt.Errorf("Failed to compute memory usage with invalid limit: %d", limit) } + if limit == math.MaxUint64 { + vmem, err := mem.VirtualMemory() + if err != nil { + return -1, fmt.Errorf("Failed to fall back to system max memory: %w", err) + } + limit = vmem.Total + } return float64(usage) / float64(limit), nil } diff --git a/go/vt/servenv/metrics_cgroup_test.go b/go/vt/servenv/metrics_cgroup_test.go new file mode 100644 index 00000000000..66069d1fcaf --- /dev/null +++ b/go/vt/servenv/metrics_cgroup_test.go @@ -0,0 +1,37 @@ +//go:build linux +// +build linux + +/* +Copyright 2025 The Vitess Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package servenv + +import ( + "testing" +) + +func TestGetCGroupCpuUsageMetrics(t *testing.T) { + sleepBeforeCpuSample() + cpu, err := getCgroupCpuUsage() + validateCpu(t, cpu, err) + t.Logf("cpu %.5f", cpu) +} + +func TestGetCgroupMemoryUsageMetrics(t *testing.T) { + mem, err := getCgroupMemoryUsage() + validateMem(t, mem, err) + t.Logf("mem %.5f", mem) +} diff --git a/go/vt/servenv/metrics_host_test.go b/go/vt/servenv/metrics_host_test.go index 447af3329b3..91d7c989097 100644 --- a/go/vt/servenv/metrics_host_test.go +++ b/go/vt/servenv/metrics_host_test.go @@ -20,18 +20,15 @@ import ( "testing" ) -func TestReportCpu(t *testing.T) { +func TestReportCpuMetrics(t *testing.T) { + sleepBeforeCpuSample() cpuUsage, err := getHostCpuUsage() - if err != nil { - t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) - } - t.Logf("CPU Utilization is %.2f", cpuUsage) + validateCpu(t, cpuUsage, err) + t.Logf("CPU Utilization is %.10f", cpuUsage) } -func TestReportMemory(t *testing.T) { +func TestReportMemoryMetrics(t *testing.T) { memoryUsage, err := getHostMemoryUsage() - if err != nil { - t.Errorf("Error reading memory: %v, value %.2f", err, memoryUsage) - } - t.Logf("Memory Utilization is %.2f", memoryUsage) + validateMem(t, memoryUsage, err) + t.Logf("Memory Utilization is %.10f", memoryUsage) } diff --git a/go/vt/servenv/metrics_linux_test.go b/go/vt/servenv/metrics_linux_test.go index 765c6f55e08..471fbbec955 100644 --- a/go/vt/servenv/metrics_linux_test.go +++ b/go/vt/servenv/metrics_linux_test.go @@ -21,22 +21,17 @@ package servenv import ( "testing" - "time" ) -func TestGetCpu(t *testing.T) { - time.Sleep(2 * time.Second) +func TestGetCpuMetrics(t *testing.T) { + sleepBeforeCpuSample() cpuUsage, err := getHostCpuUsage() - if err != nil || cpuUsage < 0 { - t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) - } + validateCpu(t, cpuUsage, err) t.Logf("CPU Utilization is %.2f", cpuUsage) } -func TestGetMemory(t *testing.T) { +func TestGetMemoryMetrics(t *testing.T) { memoryUsage, err := getHostMemoryUsage() - if err != nil || memoryUsage < 0 || memoryUsage > 1 { - t.Errorf("Error reading memory: %v, value %.2f", err, memoryUsage) - } + validateMem(t, memoryUsage, err) t.Logf("Memory Utilization is %.2f", memoryUsage) } diff --git a/go/vt/servenv/metrics_nonlinux_test.go b/go/vt/servenv/metrics_nonlinux_test.go index 508e32ddab1..9a5bee6f8d0 100644 --- a/go/vt/servenv/metrics_nonlinux_test.go +++ b/go/vt/servenv/metrics_nonlinux_test.go @@ -23,18 +23,15 @@ import ( "testing" ) -func TestGetCpu(t *testing.T) { +func TestGetCpuMetrics(t *testing.T) { + sleepBeforeCpuSample() cpuUsage, err := getHostCpuUsage() - if err != nil || cpuUsage < 0 { - t.Errorf("Error reading CPU: %v, value %.2f", err, cpuUsage) - } + validateCpu(t, cpuUsage, err) t.Logf("CPU Utilization is %.2f", cpuUsage) } -func TestGetMemory(t *testing.T) { +func TestGetMemoryMetrics(t *testing.T) { memoryUsage, err := getHostMemoryUsage() - if err != nil || memoryUsage < 0 || memoryUsage > 1 { - t.Errorf("Error reading memory: %v, value %.2f", err, memoryUsage) - } + validateMem(t, memoryUsage, err) t.Logf("Memory Utilization is %.2f", memoryUsage) } diff --git a/go/vt/servenv/metrics_test.go b/go/vt/servenv/metrics_test.go index 79dc742b23d..605d4bc6179 100644 --- a/go/vt/servenv/metrics_test.go +++ b/go/vt/servenv/metrics_test.go @@ -17,21 +17,42 @@ limitations under the License. package servenv import ( + "runtime" "testing" + "time" ) -func TestGetCpuUsage(t *testing.T) { +func sleepBeforeCpuSample() { + time.Sleep(750 * time.Millisecond) +} + +func validateCpu(t *testing.T, cpu float64, err error) { + if err != nil { + t.Errorf("Error reading CPU: %v, value %.10f", err, cpu) + } + if cpu <= 0 || cpu > float64(runtime.NumCPU()) { + t.Errorf("CPU value out of range %5.f", cpu) + } +} + +func validateMem(t *testing.T, mem float64, err error) { + if err != nil { + t.Errorf("Error reading memory: %v, value %.10f", err, mem) + } + if mem <= 0 || mem > 1 { + t.Errorf("Mem value out of range %5.f", mem) + } +} + +func TestGetCpuUsageMetrics(t *testing.T) { + sleepBeforeCpuSample() value := getCpuUsage() t.Logf("CPU usage %v", value) - if value < 0 || value > 1 { - t.Errorf("Got invalid CPU value %v", value) - } + validateCpu(t, value, nil) } -func TestGetMemoryUsage(t *testing.T) { +func TestGetMemoryUsageMetrics(t *testing.T) { value := getMemoryUsage() t.Logf("Memory usage %v", value) - if value < 0 || value > 1 { - t.Errorf("Got invalid memory value %v", value) - } + validateMem(t, value, nil) } From c952cfa734097388baef52cb3b4bb96ed8fa0e6a Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 5 Jun 2025 12:50:04 -0700 Subject: [PATCH 12/16] Disambiguate orca.Register function Signed-off-by: twthorn --- go/vt/servenv/grpc_server.go | 4 ++-- go/vt/servenv/grpc_server_test.go | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go/vt/servenv/grpc_server.go b/go/vt/servenv/grpc_server.go index 604daa88e3d..44334fb1a1f 100644 --- a/go/vt/servenv/grpc_server.go +++ b/go/vt/servenv/grpc_server.go @@ -114,7 +114,7 @@ var ( // Injectable behavior for testing. var ( - registerORCA = orca.Register + orcaRegisterFunc = orca.Register ) // TLS variables. @@ -349,7 +349,7 @@ func serveGRPC() { } func registerOrca() { - if err := registerORCA(GRPCServer, orca.ServiceOptions{ + if err := orcaRegisterFunc(GRPCServer, orca.ServiceOptions{ // The minimum interval of orca is 30 seconds, unless we enable a testing flag. MinReportingInterval: 30 * time.Second, ServerMetricsProvider: GRPCServerMetricsRecorder, diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index a93a7a30b3e..7ad4d33fbf2 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -94,11 +94,11 @@ func TestEnableOrca(t *testing.T) { } called := false - registerORCA = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { + orcaRegisterFunc = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { called = true return nil } - defer func() { registerORCA = orca.Register }() + defer func() { orcaRegisterFunc = orca.Register }() serveGRPC() if !called { t.Errorf("registerORCA should have been called when ORCA metrics are enabled") @@ -117,11 +117,11 @@ func TestDisableOrca(t *testing.T) { } called := false - registerORCA = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { + orcaRegisterFunc = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { called = true return nil } - defer func() { registerORCA = orca.Register }() + defer func() { orcaRegisterFunc = orca.Register }() serveGRPC() if called { t.Errorf("registerORCA should NOT have been called when ORCA metrics are enabled") From 8bcd7b23c82a5e67a6de9b35894d7fe78f38cddc Mon Sep 17 00:00:00 2001 From: twthorn Date: Fri, 13 Jun 2025 14:25:29 -0700 Subject: [PATCH 13/16] Remove docker cgroup metrics test, remove redundant host metrics calls Signed-off-by: twthorn --- docker/test/Dockerfile.metrics | 17 ------- .../metrics/docker_metrics_cgroup_test.go | 49 ------------------- go/vt/servenv/metrics.go | 4 +- go/vt/servenv/metrics_linux.go | 8 --- go/vt/servenv/metrics_nonlinux.go | 8 --- 5 files changed, 2 insertions(+), 84 deletions(-) delete mode 100644 docker/test/Dockerfile.metrics delete mode 100644 go/test/endtoend/metrics/docker_metrics_cgroup_test.go diff --git a/docker/test/Dockerfile.metrics b/docker/test/Dockerfile.metrics deleted file mode 100644 index c655ec59885..00000000000 --- a/docker/test/Dockerfile.metrics +++ /dev/null @@ -1,17 +0,0 @@ -# Use the official Go image as a base image -FROM golang:1.24.3 - -# Set the working directory inside the container -WORKDIR /app - -# Copy the go mod and sum files -COPY go.mod go.sum ./ - -# Download all dependencies -RUN go mod download - -# Copy the source from the current directory to the working directory inside the container -COPY . . - -# Command to run the tests -CMD ["go", "test", "-count=1", "-timeout=60s", "-run", "Metrics", "vitess.io/vitess/go/vt/servenv", "-v"] diff --git a/go/test/endtoend/metrics/docker_metrics_cgroup_test.go b/go/test/endtoend/metrics/docker_metrics_cgroup_test.go deleted file mode 100644 index bb52183b84a..00000000000 --- a/go/test/endtoend/metrics/docker_metrics_cgroup_test.go +++ /dev/null @@ -1,49 +0,0 @@ -/* -Copyright 2025 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package servenv - -import ( - "fmt" - "os" - "os/exec" - "testing" - - "github.com/stretchr/testify/require" -) - -func TestMain(m *testing.M) { - fmt.Println("Building test Docker image...") - build := exec.Command("docker", "build", "-f", "docker/test/Dockerfile.metrics", "-t", "metrics:test", ".") - build.Dir = "../../../.." - build.Stdout = os.Stdout - build.Stderr = os.Stderr - if err := build.Run(); err != nil { - fmt.Fprintf(os.Stderr, "Failed to build Docker image: %v\n", err) - os.Exit(1) - } - - os.Exit(m.Run()) -} - -func TestReportCpuInDocker(t *testing.T) { - cmd := exec.Command("docker", "run", "--rm", "metrics:test") - cmd.Stdout = os.Stdout - cmd.Stderr = os.Stderr - - err := cmd.Run() - require.NoError(t, err, "Docker test container failed") -} diff --git a/go/vt/servenv/metrics.go b/go/vt/servenv/metrics.go index 79e39b1f45f..a2b25dc8fc5 100644 --- a/go/vt/servenv/metrics.go +++ b/go/vt/servenv/metrics.go @@ -21,7 +21,7 @@ func getCpuUsage() float64 { if err == nil { return value } - value, err = getHostCpu() + value, err = getHostCpuUsage() if err == nil { return value } @@ -33,7 +33,7 @@ func getMemoryUsage() float64 { if err == nil { return value } - value, err = getHostMemory() + value, err = getHostMemoryUsage() if err == nil { return value } diff --git a/go/vt/servenv/metrics_linux.go b/go/vt/servenv/metrics_linux.go index 68dbd24b423..c0f0d59c23f 100644 --- a/go/vt/servenv/metrics_linux.go +++ b/go/vt/servenv/metrics_linux.go @@ -19,14 +19,6 @@ limitations under the License. package servenv -func getHostCpu() (float64, error) { - return getHostCpuUsage() -} - -func getHostMemory() (float64, error) { - return getHostMemoryUsage() -} - func getCgroupCpu() (float64, error) { return getCgroupCpuUsage() } diff --git a/go/vt/servenv/metrics_nonlinux.go b/go/vt/servenv/metrics_nonlinux.go index 07a9fbe515d..53d854f8c94 100644 --- a/go/vt/servenv/metrics_nonlinux.go +++ b/go/vt/servenv/metrics_nonlinux.go @@ -23,14 +23,6 @@ import ( "fmt" ) -func getHostCpu() (float64, error) { - return getHostCpuUsage() -} - -func getHostMemory() (float64, error) { - return getHostMemoryUsage() -} - func getCgroupCpu() (float64, error) { return -1, fmt.Errorf("Cgroup not supported on nonlinux platform") } From 4d56f58a546375057b8846c951da3dd1e9534c63 Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 19 Jun 2025 16:56:55 -0700 Subject: [PATCH 14/16] Remove flakey tests operating on global state Signed-off-by: twthorn --- go/vt/servenv/grpc_server_test.go | 48 ------------------------------- 1 file changed, 48 deletions(-) diff --git a/go/vt/servenv/grpc_server_test.go b/go/vt/servenv/grpc_server_test.go index 7ad4d33fbf2..412b728e8b9 100644 --- a/go/vt/servenv/grpc_server_test.go +++ b/go/vt/servenv/grpc_server_test.go @@ -80,54 +80,6 @@ func TestOrcaRecorder(t *testing.T) { } } -func TestEnableOrca(t *testing.T) { - // Set the port to enable gRPC server. - withTempVar(&gRPCPort, getFreePort()) - withTempVar(&gRPCEnableOrcaMetrics, true) - withTempVar(&GRPCServerMetricsRecorder, nil) - - gRPCEnableOrcaMetrics = true - GRPCServerMetricsRecorder = nil - createGRPCServer() - if GRPCServerMetricsRecorder == nil { - t.Errorf("GRPCServerMetricsRecorder should be initialized when gRPCEnableOrcaMetrics is true") - } - - called := false - orcaRegisterFunc = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { - called = true - return nil - } - defer func() { orcaRegisterFunc = orca.Register }() - serveGRPC() - if !called { - t.Errorf("registerORCA should have been called when ORCA metrics are enabled") - } -} - -func TestDisableOrca(t *testing.T) { - // Set the port to enable gRPC server. - withTempVar(&gRPCPort, getFreePort()) - withTempVar(&gRPCEnableOrcaMetrics, false) - withTempVar(&GRPCServerMetricsRecorder, nil) - - createGRPCServer() - if GRPCServerMetricsRecorder != nil { - t.Errorf("GRPCServerMetricsRecorder should NOT be initialized when gRPCEnableOrcaMetrics is false") - } - - called := false - orcaRegisterFunc = func(s grpc.ServiceRegistrar, o orca.ServiceOptions) error { - called = true - return nil - } - defer func() { orcaRegisterFunc = orca.Register }() - serveGRPC() - if called { - t.Errorf("registerORCA should NOT have been called when ORCA metrics are enabled") - } -} - func TestReportedOrca(t *testing.T) { // Set the port to enable gRPC server. withTempVar(&gRPCPort, getFreePort()) From cb708d7cc6ae7589dbb3adace40c9d8fa9c0de9a Mon Sep 17 00:00:00 2001 From: twthorn Date: Fri, 20 Jun 2025 13:39:51 -0700 Subject: [PATCH 15/16] Use separate cells for metrics Signed-off-by: twthorn --- go/vt/vtgate/vstream_manager_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/vt/vtgate/vstream_manager_test.go b/go/vt/vtgate/vstream_manager_test.go index 82dda676eb1..ed1eca689ea 100644 --- a/go/vt/vtgate/vstream_manager_test.go +++ b/go/vt/vtgate/vstream_manager_test.go @@ -526,7 +526,7 @@ func TestVStreamMulti(t *testing.T) { func TestVStreamsMetrics(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cell := "aa" + cell := "ab" ks := "TestVStream" _ = createSandbox(ks) hc := discovery.NewFakeHealthCheck(nil) @@ -616,7 +616,7 @@ func waitForMetricsMatch(t *testing.T, getActual func() map[string]int64, want m func TestVStreamsMetricsErrors(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cell := "aa" + cell := "ac" ks := "TestVStream" _ = createSandbox(ks) hc := discovery.NewFakeHealthCheck(nil) From a7203a7fbabda39bad52f4e1e1c3745e27111842 Mon Sep 17 00:00:00 2001 From: twthorn Date: Thu, 26 Jun 2025 10:52:33 -0700 Subject: [PATCH 16/16] Add comment documenting why we use unique cells for vstream metrics test Signed-off-by: twthorn --- go/vt/vtgate/vstream_manager_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/go/vt/vtgate/vstream_manager_test.go b/go/vt/vtgate/vstream_manager_test.go index ed1eca689ea..f5af68806cb 100644 --- a/go/vt/vtgate/vstream_manager_test.go +++ b/go/vt/vtgate/vstream_manager_test.go @@ -526,6 +526,7 @@ func TestVStreamMulti(t *testing.T) { func TestVStreamsMetrics(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // Use a unique cell to avoid parallel tests interfering with each other's metrics cell := "ab" ks := "TestVStream" _ = createSandbox(ks) @@ -616,6 +617,7 @@ func waitForMetricsMatch(t *testing.T, getActual func() map[string]int64, want m func TestVStreamsMetricsErrors(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() + // Use a unique cell to avoid parallel tests interfering with each other's metrics cell := "ac" ks := "TestVStream" _ = createSandbox(ks)