From aeda7e3a21df6b195859108a2581c36f0bbb1284 Mon Sep 17 00:00:00 2001 From: shawnh2 Date: Sat, 22 Mar 2025 11:02:28 +0800 Subject: [PATCH 1/5] sample prom metrics and pprof profiles Signed-off-by: shawnh2 --- test/benchmark/suite/render.go | 162 ++++++++++++++++----------------- test/benchmark/suite/report.go | 105 +++++++++++---------- test/benchmark/suite/suite.go | 29 +++--- 3 files changed, 146 insertions(+), 150 deletions(-) diff --git a/test/benchmark/suite/render.go b/test/benchmark/suite/render.go index 413d02502d..df1393aea5 100644 --- a/test/benchmark/suite/render.go +++ b/test/benchmark/suite/render.go @@ -11,59 +11,18 @@ import ( "bytes" "fmt" "io" + "math" "os" - "strconv" + "path" + "sort" "strings" "text/tabwriter" ) const ( - omitEmptyValue = "-" benchmarkEnvPrefix = "BENCHMARK_" - - querySum = "Sum" - queryAvg = "Avg" - queryMin = "Min" - queryMax = "Max" ) -type tableHeader struct { - name string - unit string - promQL string // only valid for metrics table - queryType string -} - -var metricsTableHeader = []tableHeader{ - { - name: "Test Name", - }, - { - name: "Envoy Gateway Memory", - unit: "MiB", - promQL: `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024`, - queryType: querySum, - }, - { - name: "Envoy Gateway CPU", - unit: "s", - promQL: `process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}`, - queryType: querySum, - }, - { - name: "Envoy Proxy Memory (Avg)", - unit: "MiB", - promQL: `container_memory_working_set_bytes{namespace="envoy-gateway-system",container="envoy"}/1024/1024`, - queryType: queryAvg, - }, - { - name: "Envoy Proxy CPU (Avg)", - unit: "s", - promQL: `container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}`, - queryType: queryAvg, - }, -} - // RenderReport renders a report out of given list of benchmark report in Markdown format. 
func RenderReport(writer io.Writer, name, description string, titleLevel int, reports []*BenchmarkReport) error { writeSection(writer, "Test: "+name, titleLevel, description) @@ -77,7 +36,7 @@ func RenderReport(writer io.Writer, name, description string, titleLevel int, re renderMetricsTable(writer, reports) writeSection(writer, "Profiles", titleLevel+1, renderProfilesNote()) - renderProfilesTable(writer, "Memory", "heap", titleLevel+2, reports) + renderProfilesTable(writer, "Heap/Memory", "heap", titleLevel+2, reports) return nil } @@ -90,22 +49,15 @@ func newMarkdownStyleTableWriter(writer io.Writer) *tabwriter.Writer { func renderEnvSettingsTable(writer io.Writer) { table := newMarkdownStyleTableWriter(writer) - headers := []tableHeader{ - {name: "RPS"}, - {name: "Connections"}, - {name: "Duration", unit: "s"}, - {name: "CPU Limits", unit: "m"}, - {name: "Memory Limits", unit: "MiB"}, + headers := []string{ + "RPS", + "Connections", + "Duration (Seconds)", + "CPU Limits (m)", + "Memory Limits (MiB)", } writeTableHeader(table, headers) - - writeTableRow(table, headers, func(_ int, h tableHeader) string { - env := strings.ReplaceAll(strings.ToUpper(h.name), " ", "_") - if v, ok := os.LookupEnv(benchmarkEnvPrefix + env); ok { - return v - } - return omitEmptyValue - }) + writeTableRow(table, headers) _ = table.Flush() } @@ -129,20 +81,20 @@ func renderResultsTable(writer io.Writer, reports []*BenchmarkReport) error { func renderMetricsTable(writer io.Writer, reports []*BenchmarkReport) { table := newMarkdownStyleTableWriter(writer) - writeTableHeader(table, metricsTableHeader) + // write headers + headers := []string{ + "Test Name", + "Envoy Gateway Memory (MiB)\nmin/max/means", + "Envoy Gateway CPU (Seconds)\nmin/max/means", + "Averaged Envoy Proxy Memory (MiB)\nmin/max/means", + "Averaged Envoy Proxy CPU (Seconds)\nmin/max/means", + } + writeTableHeader(table, headers) for _, report := range reports { - writeTableRow(table, metricsTableHeader, func(_ int, h 
tableHeader) string { - if len(h.promQL) == 0 { - return report.Name - } - - if v, ok := report.Metrics[h.name]; ok { - return strconv.FormatFloat(v, 'f', -1, 64) - } - - return omitEmptyValue - }) + data := []string{report.Name} + data = append(data, getSamplesMinMaxMeans(report.Samples)...) + writeTableRow(table, data) } _ = table.Flush() @@ -156,19 +108,32 @@ You can visualize them in a web page by running: %s Currently, the supported profile types are: -- heap +- heap (memory) `, "`/profiles`", "`{ProfileType}.{TestCase}.pprof`", "```shell\ngo tool pprof -http=: path/to/your.pprof\n```") } func renderProfilesTable(writer io.Writer, target, key string, titleLevel int, reports []*BenchmarkReport) { - writeSection(writer, target, titleLevel, "") + writeSection(writer, target, titleLevel, + "The profiles were sampled when Envoy Gateway Memory is at its maximum.") for _, report := range reports { + // Get the heap profile when control plane memory is at its maximum. + sortedSamples := make([]BenchmarkMetricSample, len(report.Samples)) + copy(sortedSamples, report.Samples) + sort.Slice(sortedSamples, func(i, j int) bool { + return sortedSamples[i].ControlPlaneMem > sortedSamples[j].ControlPlaneMem + }) + + heapPprof := sortedSamples[0].HeapProfile + heapPprofPath := path.Join(report.ProfilesOutputDir, fmt.Sprintf("heap.%s.pprof", report.Name)) + _ = os.WriteFile(heapPprofPath, heapPprof, 0o600) + // The image is not be rendered yet, so it is a placeholder for the path. // The image will be rendered after the test has finished. 
+ rootDir := strings.SplitN(heapPprofPath, "/", 2)[0] + heapPprofPath = strings.TrimPrefix(heapPprofPath, rootDir+"/") writeSection(writer, report.Name, titleLevel+1, - fmt.Sprintf("![%s-%s](%s.png)", key, report.Name, - strings.TrimSuffix(report.ProfilesPath[key], ".pprof"))) + fmt.Sprintf("![%s-%s](%s.png)", key, report.Name, strings.TrimSuffix(heapPprofPath, ".pprof"))) } } @@ -194,21 +159,16 @@ func writeCollapsibleSection(writer io.Writer, title string, content []byte) { `, title, summary) } -func writeTableHeader(table *tabwriter.Writer, headers []tableHeader) { - writeTableRow(table, headers, func(_ int, h tableHeader) string { - if len(h.unit) > 0 { - return fmt.Sprintf("%s (%s)", h.name, h.unit) - } - return h.name - }) +func writeTableHeader(table *tabwriter.Writer, headers []string) { + writeTableRow(table, headers) writeTableDelimiter(table, len(headers)) } -// writeTableRow writes one row in Markdown table style according to headers. -func writeTableRow(table *tabwriter.Writer, headers []tableHeader, on func(int, tableHeader) string) { +// writeTableRow writes one row in Markdown table style. 
+func writeTableRow(table *tabwriter.Writer, data []string) { row := "|" - for i, v := range headers { - row += on(i, v) + "\t" + for _, v := range data { + row += v + "\t" } _, _ = fmt.Fprintln(table, row) @@ -223,3 +183,33 @@ func writeTableDelimiter(table *tabwriter.Writer, n int) { _, _ = fmt.Fprintln(table, sep) } + +func getSamplesMinMaxMeans(samples []BenchmarkMetricSample) []string { + cpMem := make([]float64, 0, len(samples)) + cpCpu := make([]float64, 0, len(samples)) + dpMem := make([]float64, 0, len(samples)) + dpCpu := make([]float64, 0, len(samples)) + for _, sample := range samples { + cpMem = append(cpMem, sample.ControlPlaneMem) + cpCpu = append(cpCpu, sample.ControlPlaneCpu) + dpMem = append(dpMem, sample.DataPlaneMem) + dpCpu = append(dpCpu, sample.DataPlaneCpu) + } + + return []string{ + getMetricsMinMaxMeans(cpMem), + getMetricsMinMaxMeans(cpCpu), + getMetricsMinMaxMeans(dpMem), + getMetricsMinMaxMeans(dpCpu), + } +} + +func getMetricsMinMaxMeans(metrics []float64) string { + var min, max, sum float64 = metrics[0], 0, 0 + for _, v := range metrics { + min = math.Min(v, min) + max = math.Max(v, max) + sum += v + } + return fmt.Sprintf("%.2f / %.2f / %.2f", min, max, sum/float64(len(metrics))) +} diff --git a/test/benchmark/suite/report.go b/test/benchmark/suite/report.go index b159e79860..83a41c3697 100644 --- a/test/benchmark/suite/report.go +++ b/test/benchmark/suite/report.go @@ -12,10 +12,6 @@ import ( "context" "fmt" "io" - "os" - "path" - "strconv" - "strings" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -26,45 +22,56 @@ import ( prom "github.com/envoyproxy/gateway/test/utils/prometheus" ) +const ( + controlPlaneMemQL = `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024` + controlPlaneCpuQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[3s])` + dataPlaneMemQL = 
`container_memory_working_set_bytes{namespace="envoy-gateway-system",container="envoy"}/1024/1024` + dataPlaneCpuQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}[3s])` +) + +// BenchmarkMetricSample contains sampled metrics and profiles data. +type BenchmarkMetricSample struct { + ControlPlaneMem float64 + ControlPlaneCpu float64 + DataPlaneMem float64 + DataPlaneCpu float64 + + HeapProfile []byte +} + type BenchmarkReport struct { Name string - Result []byte - Metrics map[string]float64 // metricTableHeaderName:metricValue - ProfilesPath map[string]string // profileKey:profileFilepath ProfilesOutputDir string + // Nighthawk benchmark result + Result []byte + // Prometheus metrics and pprof profiles sampled data + Samples []BenchmarkMetricSample kubeClient kube.CLIClient promClient *prom.Client } -func NewBenchmarkReport(name, profilesOutputDir string, kubeClient kube.CLIClient, promClient *prom.Client) (*BenchmarkReport, error) { - if err := createDirIfNotExist(profilesOutputDir); err != nil { - return nil, err - } - +func NewBenchmarkReport(name, profilesOutputDir string, kubeClient kube.CLIClient, promClient *prom.Client) *BenchmarkReport { return &BenchmarkReport{ Name: name, - Metrics: make(map[string]float64), - ProfilesPath: make(map[string]string), ProfilesOutputDir: profilesOutputDir, kubeClient: kubeClient, promClient: promClient, - }, nil + } } -func (r *BenchmarkReport) Collect(ctx context.Context, job *types.NamespacedName) error { - if err := r.GetProfiles(ctx); err != nil { - return err - } +func (r *BenchmarkReport) Sample(ctx context.Context) error { + sample := BenchmarkMetricSample{} - if err := r.GetMetrics(ctx); err != nil { + if err := r.sampleProfiles(ctx, &sample); err != nil { return err } - if err := r.GetResult(ctx, job); err != nil { + if err := r.sampleMetrics(ctx, &sample); err != nil { return err } + r.Samples = append(r.Samples, sample) return nil } @@ -97,34 +104,35 @@ func (r 
*BenchmarkReport) GetResult(ctx context.Context, job *types.NamespacedNa return nil } -func (r *BenchmarkReport) GetMetrics(ctx context.Context) error { - for _, h := range metricsTableHeader { - if len(h.promQL) == 0 { - continue - } - - var ( - v float64 - err error - ) - switch h.queryType { - case querySum: - v, err = r.promClient.QuerySum(ctx, h.promQL) - case queryAvg: - v, err = r.promClient.QueryAvg(ctx, h.promQL) - default: - return fmt.Errorf("unsupported query type: %s", h.queryType) - } +func (r *BenchmarkReport) sampleMetrics(ctx context.Context, sample *BenchmarkMetricSample) error { + // Sample memory + cpMem, err := r.promClient.QuerySum(ctx, controlPlaneMemQL) + if err != nil { + return fmt.Errorf("failed to query control plane memory: %w", err) + } + dpMem, err := r.promClient.QueryAvg(ctx, dataPlaneMemQL) + if err != nil { + return fmt.Errorf("failed to query data plane memory: %w", err) + } - if err == nil { - r.Metrics[h.name], _ = strconv.ParseFloat(fmt.Sprintf("%.2f", v), 64) - } + // Sample cpu + cpCpu, err := r.promClient.QuerySum(ctx, controlPlaneCpuQL) + if err != nil { + return fmt.Errorf("failed to query control plane cpu: %w", err) + } + dpCpu, err := r.promClient.QueryAvg(ctx, dataPlaneCpuQL) + if err != nil { + return fmt.Errorf("failed to query data plane memory: %w", err) } + sample.ControlPlaneMem = cpMem + sample.ControlPlaneCpu = cpCpu + sample.DataPlaneMem = dpMem + sample.DataPlaneCpu = dpCpu return nil } -func (r *BenchmarkReport) GetProfiles(ctx context.Context) error { +func (r *BenchmarkReport) sampleProfiles(ctx context.Context, sample *BenchmarkMetricSample) error { egPod, err := r.fetchEnvoyGatewayPod(ctx) if err != nil { return err @@ -138,16 +146,7 @@ func (r *BenchmarkReport) GetProfiles(ctx context.Context) error { return err } - heapProfPath := path.Join(r.ProfilesOutputDir, fmt.Sprintf("heap.%s.pprof", r.Name)) - if err = os.WriteFile(heapProfPath, heapProf, 0o600); err != nil { - return fmt.Errorf("failed to 
write profiles %s: %w", heapProfPath, err) - } - - // Remove parent output report dir. - splits := strings.SplitN(heapProfPath, "/", 2)[0] - heapProfPath = strings.TrimPrefix(heapProfPath, splits+"/") - r.ProfilesPath["heap"] = heapProfPath - + sample.HeapProfile = heapProf return nil } diff --git a/test/benchmark/suite/suite.go b/test/benchmark/suite/suite.go index 80a6ff1daf..53807151e0 100644 --- a/test/benchmark/suite/suite.go +++ b/test/benchmark/suite/suite.go @@ -32,9 +32,10 @@ import ( ) const ( - BenchmarkTestScaledKey = "benchmark-test/scaled" - BenchmarkTestClientKey = "benchmark-test/client" - DefaultControllerName = "gateway.envoyproxy.io/gatewayclass-controller" + BenchmarkTestScaledKey = "benchmark-test/scaled" + BenchmarkTestClientKey = "benchmark-test/client" + BenchmarkMetricsSampleTick = 3 * time.Second + DefaultControllerName = "gateway.envoyproxy.io/gatewayclass-controller" ) type BenchmarkTest struct { @@ -200,8 +201,14 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam return nil, err } + profilesOutputDir := path.Join(b.ReportSaveDir, "profiles") + if err := createDirIfNotExist(profilesOutputDir); err != nil { + return nil, err + } + // Wait from benchmark test job to complete. - if err = wait.PollUntilContextTimeout(ctx, 6*time.Second, time.Duration(duration*10)*time.Second, true, func(ctx context.Context) (bool, error) { + report := NewBenchmarkReport(resultTitle, profilesOutputDir, b.kubeClient, b.promClient) + if err = wait.PollUntilContextTimeout(ctx, BenchmarkMetricsSampleTick, time.Duration(duration*10)*time.Second, true, func(ctx context.Context) (bool, error) { job := new(batchv1.Job) if err = b.Client.Get(ctx, *jobNN, job); err != nil { return false, err @@ -221,6 +228,11 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam t.Logf("Job %s still not complete", jobName) + // Sample the metrics and profiles at runtime. 
+ if err := report.Sample(ctx); err != nil { + t.Errorf("Failed to sample metrics and profiles: %v", err) + } + return false, nil }); err != nil { t.Errorf("Failed to run benchmark test: %v", err) @@ -230,13 +242,8 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam t.Logf("Running benchmark test: %s successfully", resultTitle) - report, err := NewBenchmarkReport(resultTitle, path.Join(b.ReportSaveDir, "profiles"), b.kubeClient, b.promClient) - if err != nil { - return nil, fmt.Errorf("failed to create benchmark report: %w", err) - } - - // Get all the reports from this benchmark test run. - if err = report.Collect(ctx, jobNN); err != nil { + // Get nighthaw result from this benchmark test run. + if err = report.GetResult(ctx, jobNN); err != nil { return nil, err } From 60b41ba458d8263f20c12bb6432730ef334f9b52 Mon Sep 17 00:00:00 2001 From: shawnh2 Date: Sat, 22 Mar 2025 15:26:50 +0800 Subject: [PATCH 2/5] update env table data and promql Signed-off-by: shawnh2 --- test/benchmark/suite/render.go | 26 +++++++++++++++----------- test/benchmark/suite/report.go | 16 ++++++++-------- test/benchmark/suite/suite.go | 2 +- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/test/benchmark/suite/render.go b/test/benchmark/suite/render.go index df1393aea5..46b96e655f 100644 --- a/test/benchmark/suite/render.go +++ b/test/benchmark/suite/render.go @@ -19,10 +19,6 @@ import ( "text/tabwriter" ) -const ( - benchmarkEnvPrefix = "BENCHMARK_" -) - // RenderReport renders a report out of given list of benchmark report in Markdown format. 
func RenderReport(writer io.Writer, name, description string, titleLevel int, reports []*BenchmarkReport) error { writeSection(writer, "Test: "+name, titleLevel, description) @@ -57,7 +53,15 @@ func renderEnvSettingsTable(writer io.Writer) { "Memory Limits (MiB)", } writeTableHeader(table, headers) - writeTableRow(table, headers) + + data := []string{ + os.Getenv("BENCHMARK_RPS"), + os.Getenv("BENCHMARK_CONNECTIONS"), + os.Getenv("BENCHMARK_DURATION"), + os.Getenv("BENCHMARK_CPU_LIMITS"), + os.Getenv("BENCHMARK_MEMORY_LIMITS"), + } + writeTableRow(table, data) _ = table.Flush() } @@ -186,21 +190,21 @@ func writeTableDelimiter(table *tabwriter.Writer, n int) { func getSamplesMinMaxMeans(samples []BenchmarkMetricSample) []string { cpMem := make([]float64, 0, len(samples)) - cpCpu := make([]float64, 0, len(samples)) + cpCPU := make([]float64, 0, len(samples)) dpMem := make([]float64, 0, len(samples)) - dpCpu := make([]float64, 0, len(samples)) + dpCPU := make([]float64, 0, len(samples)) for _, sample := range samples { cpMem = append(cpMem, sample.ControlPlaneMem) - cpCpu = append(cpCpu, sample.ControlPlaneCpu) + cpCPU = append(cpCPU, sample.ControlPlaneCPU) dpMem = append(dpMem, sample.DataPlaneMem) - dpCpu = append(dpCpu, sample.DataPlaneCpu) + dpCPU = append(dpCPU, sample.DataPlaneCPU) } return []string{ getMetricsMinMaxMeans(cpMem), - getMetricsMinMaxMeans(cpCpu), + getMetricsMinMaxMeans(cpCPU), getMetricsMinMaxMeans(dpMem), - getMetricsMinMaxMeans(dpCpu), + getMetricsMinMaxMeans(dpCPU), } } diff --git a/test/benchmark/suite/report.go b/test/benchmark/suite/report.go index 83a41c3697..9adf14ee0c 100644 --- a/test/benchmark/suite/report.go +++ b/test/benchmark/suite/report.go @@ -24,17 +24,17 @@ import ( const ( controlPlaneMemQL = `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024` - controlPlaneCpuQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[3s])` + 
controlPlaneCPUQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[1m])` dataPlaneMemQL = `container_memory_working_set_bytes{namespace="envoy-gateway-system",container="envoy"}/1024/1024` - dataPlaneCpuQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}[3s])` + dataPlaneCPUQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}[1m])` ) // BenchmarkMetricSample contains sampled metrics and profiles data. type BenchmarkMetricSample struct { ControlPlaneMem float64 - ControlPlaneCpu float64 + ControlPlaneCPU float64 DataPlaneMem float64 - DataPlaneCpu float64 + DataPlaneCPU float64 HeapProfile []byte } @@ -116,19 +116,19 @@ func (r *BenchmarkReport) sampleMetrics(ctx context.Context, sample *BenchmarkMe } // Sample cpu - cpCpu, err := r.promClient.QuerySum(ctx, controlPlaneCpuQL) + cpCPU, err := r.promClient.QuerySum(ctx, controlPlaneCPUQL) if err != nil { return fmt.Errorf("failed to query control plane cpu: %w", err) } - dpCpu, err := r.promClient.QueryAvg(ctx, dataPlaneCpuQL) + dpCPU, err := r.promClient.QueryAvg(ctx, dataPlaneCPUQL) if err != nil { return fmt.Errorf("failed to query data plane memory: %w", err) } sample.ControlPlaneMem = cpMem - sample.ControlPlaneCpu = cpCpu + sample.ControlPlaneCPU = cpCPU sample.DataPlaneMem = dpMem - sample.DataPlaneCpu = dpCpu + sample.DataPlaneCPU = dpCPU return nil } diff --git a/test/benchmark/suite/suite.go b/test/benchmark/suite/suite.go index 53807151e0..ba3b020c49 100644 --- a/test/benchmark/suite/suite.go +++ b/test/benchmark/suite/suite.go @@ -230,7 +230,7 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam // Sample the metrics and profiles at runtime. 
if err := report.Sample(ctx); err != nil { - t.Errorf("Failed to sample metrics and profiles: %v", err) + t.Errorf("Failed to sample metrics or profiles: %v", err) } return false, nil From 199283ff6c979dac495cd23734c314a6be9f318b Mon Sep 17 00:00:00 2001 From: shawnh2 Date: Sat, 22 Mar 2025 17:14:25 +0800 Subject: [PATCH 3/5] enhance metrics value process Signed-off-by: shawnh2 --- test/benchmark/suite/render.go | 17 +++++++++---- test/benchmark/suite/report.go | 46 +++++++++++++++++----------------- test/benchmark/suite/suite.go | 3 ++- 3 files changed, 37 insertions(+), 29 deletions(-) diff --git a/test/benchmark/suite/render.go b/test/benchmark/suite/render.go index 46b96e655f..1565e6f9b9 100644 --- a/test/benchmark/suite/render.go +++ b/test/benchmark/suite/render.go @@ -89,9 +89,9 @@ func renderMetricsTable(writer io.Writer, reports []*BenchmarkReport) { headers := []string{ "Test Name", "Envoy Gateway Memory (MiB)\nmin/max/means", - "Envoy Gateway CPU (Seconds)\nmin/max/means", + "Envoy Gateway CPU (%)\nmin/max/means", "Averaged Envoy Proxy Memory (MiB)\nmin/max/means", - "Averaged Envoy Proxy CPU (Seconds)\nmin/max/means", + "Averaged Envoy Proxy CPU (%)\nmin/max/means", } writeTableHeader(table, headers) @@ -209,11 +209,18 @@ func getSamplesMinMaxMeans(samples []BenchmarkMetricSample) []string { } func getMetricsMinMaxMeans(metrics []float64) string { - var min, max, sum float64 = metrics[0], 0, 0 + var min, max, avg float64 = math.MaxFloat64, 0, 0 for _, v := range metrics { min = math.Min(v, min) max = math.Max(v, max) - sum += v + avg += v } - return fmt.Sprintf("%.2f / %.2f / %.2f", min, max, sum/float64(len(metrics))) + if min == math.MaxFloat64 { + min = 0 + } + if len(metrics) > 0 { + avg /= float64(len(metrics)) + } + + return fmt.Sprintf("%.2f / %.2f / %.2f", min, max, avg) } diff --git a/test/benchmark/suite/report.go b/test/benchmark/suite/report.go index 9adf14ee0c..cf0bbcaa66 100644 --- a/test/benchmark/suite/report.go +++ 
b/test/benchmark/suite/report.go @@ -10,6 +10,7 @@ package suite import ( "bytes" "context" + "errors" "fmt" "io" @@ -24,9 +25,9 @@ import ( const ( controlPlaneMemQL = `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024` - controlPlaneCPUQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[1m])` + controlPlaneCPUQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[1m])*100` dataPlaneMemQL = `container_memory_working_set_bytes{namespace="envoy-gateway-system",container="envoy"}/1024/1024` - dataPlaneCPUQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}[1m])` + dataPlaneCPUQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}[1m])*100` ) // BenchmarkMetricSample contains sampled metrics and profiles data. @@ -60,19 +61,19 @@ func NewBenchmarkReport(name, profilesOutputDir string, kubeClient kube.CLIClien } } -func (r *BenchmarkReport) Sample(ctx context.Context) error { +func (r *BenchmarkReport) Sample(ctx context.Context) (err error) { sample := BenchmarkMetricSample{} - if err := r.sampleProfiles(ctx, &sample); err != nil { - return err + if mErr := r.sampleMetrics(ctx, &sample); mErr != nil { + err = errors.Join(err, mErr) } - if err := r.sampleMetrics(ctx, &sample); err != nil { - return err + if pErr := r.sampleProfiles(ctx, &sample); pErr != nil { + err = errors.Join(err, pErr) } r.Samples = append(r.Samples, sample) - return nil + return err } func (r *BenchmarkReport) GetResult(ctx context.Context, job *types.NamespacedName) error { @@ -104,32 +105,31 @@ func (r *BenchmarkReport) GetResult(ctx context.Context, job *types.NamespacedNa return nil } -func (r *BenchmarkReport) sampleMetrics(ctx context.Context, sample *BenchmarkMetricSample) error { +func (r *BenchmarkReport) sampleMetrics(ctx context.Context, sample 
*BenchmarkMetricSample) (err error) { // Sample memory - cpMem, err := r.promClient.QuerySum(ctx, controlPlaneMemQL) - if err != nil { - return fmt.Errorf("failed to query control plane memory: %w", err) + cpMem, qErr := r.promClient.QuerySum(ctx, controlPlaneMemQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query control plane memory: %w", qErr)) } - dpMem, err := r.promClient.QueryAvg(ctx, dataPlaneMemQL) - if err != nil { - return fmt.Errorf("failed to query data plane memory: %w", err) + dpMem, qErr := r.promClient.QueryAvg(ctx, dataPlaneMemQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query data plane memory: %w", qErr)) } - // Sample cpu - cpCPU, err := r.promClient.QuerySum(ctx, controlPlaneCPUQL) - if err != nil { - return fmt.Errorf("failed to query control plane cpu: %w", err) + cpCPU, qErr := r.promClient.QuerySum(ctx, controlPlaneCPUQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query control plane cpu: %w", qErr)) } - dpCPU, err := r.promClient.QueryAvg(ctx, dataPlaneCPUQL) - if err != nil { - return fmt.Errorf("failed to query data plane memory: %w", err) + dpCPU, qErr := r.promClient.QueryAvg(ctx, dataPlaneCPUQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query data plane cpu: %w", qErr)) } sample.ControlPlaneMem = cpMem sample.ControlPlaneCPU = cpCPU sample.DataPlaneMem = dpMem sample.DataPlaneCPU = dpCPU - return nil + return err } func (r *BenchmarkReport) sampleProfiles(ctx context.Context, sample *BenchmarkMetricSample) error { diff --git a/test/benchmark/suite/suite.go b/test/benchmark/suite/suite.go index ba3b020c49..b41a65cf97 100644 --- a/test/benchmark/suite/suite.go +++ b/test/benchmark/suite/suite.go @@ -229,8 +229,9 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam t.Logf("Job %s still not complete", jobName) // Sample the metrics and profiles at runtime. + // Do not consider it as an error, fail sampling should not affect test running. 
if err := report.Sample(ctx); err != nil { - t.Errorf("Failed to sample metrics or profiles: %v", err) + t.Logf("Error occurs while sampling metrics or profiles: %v", err) } return false, nil From 36114e803119cc9d86fb9bf2e71cf71444accef1 Mon Sep 17 00:00:00 2001 From: shawnh2 Date: Sat, 22 Mar 2025 18:46:44 +0800 Subject: [PATCH 4/5] optimze report Signed-off-by: shawnh2 --- test/benchmark/suite/render.go | 13 +++++++------ test/benchmark/suite/report.go | 6 +++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/test/benchmark/suite/render.go b/test/benchmark/suite/render.go index 1565e6f9b9..20056860de 100644 --- a/test/benchmark/suite/render.go +++ b/test/benchmark/suite/render.go @@ -28,11 +28,12 @@ func RenderReport(writer io.Writer, name, description string, titleLevel int, re return err } - writeSection(writer, "Metrics", titleLevel+1, "") + writeSection(writer, "Metrics", titleLevel+1, + "The CPU usage statistics of both control-plane and data-plane are the CPU usage per second over the past 30 seconds.") renderMetricsTable(writer, reports) writeSection(writer, "Profiles", titleLevel+1, renderProfilesNote()) - renderProfilesTable(writer, "Heap/Memory", "heap", titleLevel+2, reports) + renderProfilesTable(writer, "Heap", "heap", titleLevel+2, reports) return nil } @@ -88,10 +89,10 @@ func renderMetricsTable(writer io.Writer, reports []*BenchmarkReport) { // write headers headers := []string{ "Test Name", - "Envoy Gateway Memory (MiB)\nmin/max/means", - "Envoy Gateway CPU (%)\nmin/max/means", - "Averaged Envoy Proxy Memory (MiB)\nmin/max/means", - "Averaged Envoy Proxy CPU (%)\nmin/max/means", + "Envoy Gateway Memory (MiB)
<br>min/max/means", + "Envoy Gateway CPU (%)<br>min/max/means", + "Averaged Envoy Proxy Memory (MiB)<br>min/max/means", + "Averaged Envoy Proxy CPU (%)<br>
min/max/means", } writeTableHeader(table, headers) diff --git a/test/benchmark/suite/report.go b/test/benchmark/suite/report.go index cf0bbcaa66..1ff4750d44 100644 --- a/test/benchmark/suite/report.go +++ b/test/benchmark/suite/report.go @@ -25,9 +25,9 @@ import ( const ( controlPlaneMemQL = `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024` - controlPlaneCPUQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[1m])*100` - dataPlaneMemQL = `container_memory_working_set_bytes{namespace="envoy-gateway-system",container="envoy"}/1024/1024` - dataPlaneCPUQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}[1m])*100` + controlPlaneCPUQL = `rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[30s])*100` + dataPlaneMemQL = `container_memory_working_set_bytes{namespace="envoy-gateway-system", container="envoy"}/1024/1024` + dataPlaneCPUQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system", container="envoy"}[30s])*100` ) // BenchmarkMetricSample contains sampled metrics and profiles data. From 8e76c30168bdcc10b0fd8421c0c30fafa20e2375 Mon Sep 17 00:00:00 2001 From: sh2 Date: Tue, 25 Mar 2025 09:47:05 +0800 Subject: [PATCH 5/5] update typo in suite.go Co-authored-by: Arko Dasgupta Signed-off-by: sh2 --- test/benchmark/suite/suite.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/benchmark/suite/suite.go b/test/benchmark/suite/suite.go index b41a65cf97..e8a045bec1 100644 --- a/test/benchmark/suite/suite.go +++ b/test/benchmark/suite/suite.go @@ -243,7 +243,7 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam t.Logf("Running benchmark test: %s successfully", resultTitle) - // Get nighthaw result from this benchmark test run. + // Get nighthawk result from this benchmark test run. 
if err = report.GetResult(ctx, jobNN); err != nil { return nil, err }