diff --git a/test/benchmark/suite/render.go b/test/benchmark/suite/render.go index 413d02502d..20056860de 100644 --- a/test/benchmark/suite/render.go +++ b/test/benchmark/suite/render.go @@ -11,59 +11,14 @@ import ( "bytes" "fmt" "io" + "math" "os" - "strconv" + "path" + "sort" "strings" "text/tabwriter" ) -const ( - omitEmptyValue = "-" - benchmarkEnvPrefix = "BENCHMARK_" - - querySum = "Sum" - queryAvg = "Avg" - queryMin = "Min" - queryMax = "Max" -) - -type tableHeader struct { - name string - unit string - promQL string // only valid for metrics table - queryType string -} - -var metricsTableHeader = []tableHeader{ - { - name: "Test Name", - }, - { - name: "Envoy Gateway Memory", - unit: "MiB", - promQL: `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024`, - queryType: querySum, - }, - { - name: "Envoy Gateway CPU", - unit: "s", - promQL: `process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}`, - queryType: querySum, - }, - { - name: "Envoy Proxy Memory (Avg)", - unit: "MiB", - promQL: `container_memory_working_set_bytes{namespace="envoy-gateway-system",container="envoy"}/1024/1024`, - queryType: queryAvg, - }, - { - name: "Envoy Proxy CPU (Avg)", - unit: "s", - promQL: `container_cpu_usage_seconds_total{namespace="envoy-gateway-system",container="envoy"}`, - queryType: queryAvg, - }, -} - // RenderReport renders a report out of given list of benchmark report in Markdown format. 
func RenderReport(writer io.Writer, name, description string, titleLevel int, reports []*BenchmarkReport) error { writeSection(writer, "Test: "+name, titleLevel, description) @@ -73,11 +28,12 @@ func RenderReport(writer io.Writer, name, description string, titleLevel int, re return err } - writeSection(writer, "Metrics", titleLevel+1, "") + writeSection(writer, "Metrics", titleLevel+1, + "The CPU usage statistics of both control-plane and data-plane are the CPU usage per second over the past 30 seconds.") renderMetricsTable(writer, reports) writeSection(writer, "Profiles", titleLevel+1, renderProfilesNote()) - renderProfilesTable(writer, "Memory", "heap", titleLevel+2, reports) + renderProfilesTable(writer, "Heap", "heap", titleLevel+2, reports) return nil } @@ -90,22 +46,23 @@ func newMarkdownStyleTableWriter(writer io.Writer) *tabwriter.Writer { func renderEnvSettingsTable(writer io.Writer) { table := newMarkdownStyleTableWriter(writer) - headers := []tableHeader{ - {name: "RPS"}, - {name: "Connections"}, - {name: "Duration", unit: "s"}, - {name: "CPU Limits", unit: "m"}, - {name: "Memory Limits", unit: "MiB"}, + headers := []string{ + "RPS", + "Connections", + "Duration (Seconds)", + "CPU Limits (m)", + "Memory Limits (MiB)", } writeTableHeader(table, headers) - writeTableRow(table, headers, func(_ int, h tableHeader) string { - env := strings.ReplaceAll(strings.ToUpper(h.name), " ", "_") - if v, ok := os.LookupEnv(benchmarkEnvPrefix + env); ok { - return v - } - return omitEmptyValue - }) + data := []string{ + os.Getenv("BENCHMARK_RPS"), + os.Getenv("BENCHMARK_CONNECTIONS"), + os.Getenv("BENCHMARK_DURATION"), + os.Getenv("BENCHMARK_CPU_LIMITS"), + os.Getenv("BENCHMARK_MEMORY_LIMITS"), + } + writeTableRow(table, data) _ = table.Flush() } @@ -129,20 +86,20 @@ func renderResultsTable(writer io.Writer, reports []*BenchmarkReport) error { func renderMetricsTable(writer io.Writer, reports []*BenchmarkReport) { table := newMarkdownStyleTableWriter(writer) - 
writeTableHeader(table, metricsTableHeader) + // write headers + headers := []string{ + "Test Name", + "Envoy Gateway Memory (MiB)
min/max/means", + "Envoy Gateway CPU (%)
min/max/means", + "Averaged Envoy Proxy Memory (MiB)
min/max/means", + "Averaged Envoy Proxy CPU (%)
min/max/means", + } + writeTableHeader(table, headers) for _, report := range reports { - writeTableRow(table, metricsTableHeader, func(_ int, h tableHeader) string { - if len(h.promQL) == 0 { - return report.Name - } - - if v, ok := report.Metrics[h.name]; ok { - return strconv.FormatFloat(v, 'f', -1, 64) - } - - return omitEmptyValue - }) + data := []string{report.Name} + data = append(data, getSamplesMinMaxMeans(report.Samples)...) + writeTableRow(table, data) } _ = table.Flush() @@ -156,19 +113,32 @@ You can visualize them in a web page by running: %s Currently, the supported profile types are: -- heap +- heap (memory) `, "`/profiles`", "`{ProfileType}.{TestCase}.pprof`", "```shell\ngo tool pprof -http=: path/to/your.pprof\n```") } func renderProfilesTable(writer io.Writer, target, key string, titleLevel int, reports []*BenchmarkReport) { - writeSection(writer, target, titleLevel, "") + writeSection(writer, target, titleLevel, + "The profiles were sampled when Envoy Gateway Memory is at its maximum.") for _, report := range reports { + // Get the heap profile when control plane memory is at its maximum. + sortedSamples := make([]BenchmarkMetricSample, len(report.Samples)) + copy(sortedSamples, report.Samples) + sort.Slice(sortedSamples, func(i, j int) bool { + return sortedSamples[i].ControlPlaneMem > sortedSamples[j].ControlPlaneMem + }) + + heapPprof := sortedSamples[0].HeapProfile + heapPprofPath := path.Join(report.ProfilesOutputDir, fmt.Sprintf("heap.%s.pprof", report.Name)) + _ = os.WriteFile(heapPprofPath, heapPprof, 0o600) + // The image is not be rendered yet, so it is a placeholder for the path. // The image will be rendered after the test has finished. 
+ rootDir := strings.SplitN(heapPprofPath, "/", 2)[0] + heapPprofPath = strings.TrimPrefix(heapPprofPath, rootDir+"/") writeSection(writer, report.Name, titleLevel+1, - fmt.Sprintf("![%s-%s](%s.png)", key, report.Name, - strings.TrimSuffix(report.ProfilesPath[key], ".pprof"))) + fmt.Sprintf("![%s-%s](%s.png)", key, report.Name, strings.TrimSuffix(heapPprofPath, ".pprof"))) } } @@ -194,21 +164,16 @@ func writeCollapsibleSection(writer io.Writer, title string, content []byte) { `, title, summary) } -func writeTableHeader(table *tabwriter.Writer, headers []tableHeader) { - writeTableRow(table, headers, func(_ int, h tableHeader) string { - if len(h.unit) > 0 { - return fmt.Sprintf("%s (%s)", h.name, h.unit) - } - return h.name - }) +func writeTableHeader(table *tabwriter.Writer, headers []string) { + writeTableRow(table, headers) writeTableDelimiter(table, len(headers)) } -// writeTableRow writes one row in Markdown table style according to headers. -func writeTableRow(table *tabwriter.Writer, headers []tableHeader, on func(int, tableHeader) string) { +// writeTableRow writes one row in Markdown table style. 
+func writeTableRow(table *tabwriter.Writer, data []string) { row := "|" - for i, v := range headers { - row += on(i, v) + "\t" + for _, v := range data { + row += v + "\t" } _, _ = fmt.Fprintln(table, row) @@ -223,3 +188,40 @@ func writeTableDelimiter(table *tabwriter.Writer, n int) { _, _ = fmt.Fprintln(table, sep) } + +func getSamplesMinMaxMeans(samples []BenchmarkMetricSample) []string { + cpMem := make([]float64, 0, len(samples)) + cpCPU := make([]float64, 0, len(samples)) + dpMem := make([]float64, 0, len(samples)) + dpCPU := make([]float64, 0, len(samples)) + for _, sample := range samples { + cpMem = append(cpMem, sample.ControlPlaneMem) + cpCPU = append(cpCPU, sample.ControlPlaneCPU) + dpMem = append(dpMem, sample.DataPlaneMem) + dpCPU = append(dpCPU, sample.DataPlaneCPU) + } + + return []string{ + getMetricsMinMaxMeans(cpMem), + getMetricsMinMaxMeans(cpCPU), + getMetricsMinMaxMeans(dpMem), + getMetricsMinMaxMeans(dpCPU), + } +} + +func getMetricsMinMaxMeans(metrics []float64) string { + var min, max, avg float64 = math.MaxFloat64, 0, 0 + for _, v := range metrics { + min = math.Min(v, min) + max = math.Max(v, max) + avg += v + } + if min == math.MaxFloat64 { + min = 0 + } + if len(metrics) > 0 { + avg /= float64(len(metrics)) + } + + return fmt.Sprintf("%.2f / %.2f / %.2f", min, max, avg) +} diff --git a/test/benchmark/suite/report.go b/test/benchmark/suite/report.go index b159e79860..1ff4750d44 100644 --- a/test/benchmark/suite/report.go +++ b/test/benchmark/suite/report.go @@ -10,12 +10,9 @@ package suite import ( "bytes" "context" + "errors" "fmt" "io" - "os" - "path" - "strconv" - "strings" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -26,46 +23,57 @@ import ( prom "github.com/envoyproxy/gateway/test/utils/prometheus" ) +const ( + controlPlaneMemQL = `process_resident_memory_bytes{namespace="envoy-gateway-system", control_plane="envoy-gateway"}/1024/1024` + controlPlaneCPUQL = 
`rate(process_cpu_seconds_total{namespace="envoy-gateway-system", control_plane="envoy-gateway"}[30s])*100` + dataPlaneMemQL = `container_memory_working_set_bytes{namespace="envoy-gateway-system", container="envoy"}/1024/1024` + dataPlaneCPUQL = `rate(container_cpu_usage_seconds_total{namespace="envoy-gateway-system", container="envoy"}[30s])*100` +) + +// BenchmarkMetricSample contains sampled metrics and profiles data. +type BenchmarkMetricSample struct { + ControlPlaneMem float64 + ControlPlaneCPU float64 + DataPlaneMem float64 + DataPlaneCPU float64 + + HeapProfile []byte +} + type BenchmarkReport struct { Name string - Result []byte - Metrics map[string]float64 // metricTableHeaderName:metricValue - ProfilesPath map[string]string // profileKey:profileFilepath ProfilesOutputDir string + // Nighthawk benchmark result + Result []byte + // Prometheus metrics and pprof profiles sampled data + Samples []BenchmarkMetricSample kubeClient kube.CLIClient promClient *prom.Client } -func NewBenchmarkReport(name, profilesOutputDir string, kubeClient kube.CLIClient, promClient *prom.Client) (*BenchmarkReport, error) { - if err := createDirIfNotExist(profilesOutputDir); err != nil { - return nil, err - } - +func NewBenchmarkReport(name, profilesOutputDir string, kubeClient kube.CLIClient, promClient *prom.Client) *BenchmarkReport { return &BenchmarkReport{ Name: name, - Metrics: make(map[string]float64), - ProfilesPath: make(map[string]string), ProfilesOutputDir: profilesOutputDir, kubeClient: kubeClient, promClient: promClient, - }, nil + } } -func (r *BenchmarkReport) Collect(ctx context.Context, job *types.NamespacedName) error { - if err := r.GetProfiles(ctx); err != nil { - return err - } +func (r *BenchmarkReport) Sample(ctx context.Context) (err error) { + sample := BenchmarkMetricSample{} - if err := r.GetMetrics(ctx); err != nil { - return err + if mErr := r.sampleMetrics(ctx, &sample); mErr != nil { + err = errors.Join(err, mErr) } - if err := r.GetResult(ctx, job); err 
!= nil { - return err + if pErr := r.sampleProfiles(ctx, &sample); pErr != nil { + err = errors.Join(err, pErr) } - return nil + r.Samples = append(r.Samples, sample) + return err } func (r *BenchmarkReport) GetResult(ctx context.Context, job *types.NamespacedName) error { @@ -97,34 +105,34 @@ func (r *BenchmarkReport) GetResult(ctx context.Context, job *types.NamespacedNa return nil } -func (r *BenchmarkReport) GetMetrics(ctx context.Context) error { - for _, h := range metricsTableHeader { - if len(h.promQL) == 0 { - continue - } - - var ( - v float64 - err error - ) - switch h.queryType { - case querySum: - v, err = r.promClient.QuerySum(ctx, h.promQL) - case queryAvg: - v, err = r.promClient.QueryAvg(ctx, h.promQL) - default: - return fmt.Errorf("unsupported query type: %s", h.queryType) - } - - if err == nil { - r.Metrics[h.name], _ = strconv.ParseFloat(fmt.Sprintf("%.2f", v), 64) - } +func (r *BenchmarkReport) sampleMetrics(ctx context.Context, sample *BenchmarkMetricSample) (err error) { + // Sample memory + cpMem, qErr := r.promClient.QuerySum(ctx, controlPlaneMemQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query control plane memory: %w", qErr)) + } + dpMem, qErr := r.promClient.QueryAvg(ctx, dataPlaneMemQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query data plane memory: %w", qErr)) + } + // Sample cpu + cpCPU, qErr := r.promClient.QuerySum(ctx, controlPlaneCPUQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query control plane cpu: %w", qErr)) + } + dpCPU, qErr := r.promClient.QueryAvg(ctx, dataPlaneCPUQL) + if qErr != nil { + err = errors.Join(err, fmt.Errorf("failed to query data plane cpu: %w", qErr)) } - return nil + sample.ControlPlaneMem = cpMem + sample.ControlPlaneCPU = cpCPU + sample.DataPlaneMem = dpMem + sample.DataPlaneCPU = dpCPU + return err } -func (r *BenchmarkReport) GetProfiles(ctx context.Context) error { +func (r *BenchmarkReport) sampleProfiles(ctx context.Context, sample 
*BenchmarkMetricSample) error { egPod, err := r.fetchEnvoyGatewayPod(ctx) if err != nil { return err @@ -138,16 +146,7 @@ func (r *BenchmarkReport) GetProfiles(ctx context.Context) error { return err } - heapProfPath := path.Join(r.ProfilesOutputDir, fmt.Sprintf("heap.%s.pprof", r.Name)) - if err = os.WriteFile(heapProfPath, heapProf, 0o600); err != nil { - return fmt.Errorf("failed to write profiles %s: %w", heapProfPath, err) - } - - // Remove parent output report dir. - splits := strings.SplitN(heapProfPath, "/", 2)[0] - heapProfPath = strings.TrimPrefix(heapProfPath, splits+"/") - r.ProfilesPath["heap"] = heapProfPath - + sample.HeapProfile = heapProf return nil } diff --git a/test/benchmark/suite/suite.go b/test/benchmark/suite/suite.go index 80a6ff1daf..e8a045bec1 100644 --- a/test/benchmark/suite/suite.go +++ b/test/benchmark/suite/suite.go @@ -32,9 +32,10 @@ import ( ) const ( - BenchmarkTestScaledKey = "benchmark-test/scaled" - BenchmarkTestClientKey = "benchmark-test/client" - DefaultControllerName = "gateway.envoyproxy.io/gatewayclass-controller" + BenchmarkTestScaledKey = "benchmark-test/scaled" + BenchmarkTestClientKey = "benchmark-test/client" + BenchmarkMetricsSampleTick = 3 * time.Second + DefaultControllerName = "gateway.envoyproxy.io/gatewayclass-controller" ) type BenchmarkTest struct { @@ -200,8 +201,14 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam return nil, err } + profilesOutputDir := path.Join(b.ReportSaveDir, "profiles") + if err := createDirIfNotExist(profilesOutputDir); err != nil { + return nil, err + } + // Wait from benchmark test job to complete. 
- if err = wait.PollUntilContextTimeout(ctx, 6*time.Second, time.Duration(duration*10)*time.Second, true, func(ctx context.Context) (bool, error) { + report := NewBenchmarkReport(resultTitle, profilesOutputDir, b.kubeClient, b.promClient) + if err = wait.PollUntilContextTimeout(ctx, BenchmarkMetricsSampleTick, time.Duration(duration*10)*time.Second, true, func(ctx context.Context) (bool, error) { job := new(batchv1.Job) if err = b.Client.Get(ctx, *jobNN, job); err != nil { return false, err @@ -221,6 +228,12 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam t.Logf("Job %s still not complete", jobName) + // Sample the metrics and profiles at runtime. + // Do not consider it as an error, fail sampling should not affect test running. + if err := report.Sample(ctx); err != nil { + t.Logf("Error occurs while sampling metrics or profiles: %v", err) + } + return false, nil }); err != nil { t.Errorf("Failed to run benchmark test: %v", err) @@ -230,13 +243,8 @@ func (b *BenchmarkTestSuite) Benchmark(t *testing.T, ctx context.Context, jobNam t.Logf("Running benchmark test: %s successfully", resultTitle) - report, err := NewBenchmarkReport(resultTitle, path.Join(b.ReportSaveDir, "profiles"), b.kubeClient, b.promClient) - if err != nil { - return nil, fmt.Errorf("failed to create benchmark report: %w", err) - } - - // Get all the reports from this benchmark test run. - if err = report.Collect(ctx, jobNN); err != nil { + // Get nighthawk result from this benchmark test run. + if err = report.GetResult(ctx, jobNN); err != nil { return nil, err }