diff --git a/internal/metrics/register.go b/internal/metrics/register.go index 1f4c0a483f..ebbc6ee218 100644 --- a/internal/metrics/register.go +++ b/internal/metrics/register.go @@ -24,6 +24,8 @@ import ( egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" "github.com/envoyproxy/gateway/internal/envoygateway/config" log "github.com/envoyproxy/gateway/internal/logging" + "github.com/envoyproxy/gateway/internal/metrics/restclient" + "github.com/envoyproxy/gateway/internal/metrics/workqueue" ) const ( @@ -89,8 +91,14 @@ func newOptions(svr *config.Server) (registerOptions, error) { newOpts.pullOptions.disable = true } else { newOpts.pullOptions.disable = false + restclient.RegisterClientMetricsWithoutRequestTotal(metricsserver.Registry) + // Workqueue metrics are already registered in controller-runtime. Use another registry. + reg := prometheus.NewRegistry() + workqueue.RegisterMetrics(reg) newOpts.pullOptions.registry = metricsserver.Registry - newOpts.pullOptions.gatherer = metricsserver.Registry + newOpts.pullOptions.gatherer = prometheus.Gatherers{ + metricsserver.Registry, reg, + } } for _, config := range svr.EnvoyGateway.GetEnvoyGatewayTelemetry().Metrics.Sinks { diff --git a/internal/metrics/restclient/metrics.go b/internal/metrics/restclient/metrics.go new file mode 100644 index 0000000000..fb35b4b1c5 --- /dev/null +++ b/internal/metrics/restclient/metrics.go @@ -0,0 +1,127 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package restclient + +import ( + "context" + "net/url" + "time" + + "github.com/prometheus/client_golang/prometheus" + clientmetrics "k8s.io/client-go/tools/metrics" +) + +var ( + // requestLatency is a Prometheus Histogram metric type partitioned by + // "verb", and "host" labels. It is used for the rest client latency metrics. + requestLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "rest_client_request_duration_seconds", + Help: "Request latency in seconds. Broken down by verb, and host.", + Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0}, + }, + []string{"verb", "host"}, + ) + + requestSize = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "rest_client_request_size_bytes", + Help: "Request size in bytes. Broken down by verb and host.", + // 64 bytes to 16MB + Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216}, + }, + []string{"verb", "host"}, + ) + + responseSize = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "rest_client_response_size_bytes", + Help: "Response size in bytes. Broken down by verb and host.", + // 64 bytes to 16MB + Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216}, + }, + []string{"verb", "host"}, + ) + + // RateLimiterLatency is the client side rate limiter latency metric. + rateLimiterLatency = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: "rest_client_rate_limiter_duration_seconds", + Help: "Rate limiter latency in seconds. Broken down by verb, and host.", + Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0}, + }, + []string{"verb", "host"}, + ) + + // RequestRetry is the retry metric that tracks the number of + // retries sent to the server. + requestRetry = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "rest_client_requests_retry_total", + Help: "Number of HTTP requests retry, partitioned by status code, method, and host.", + }, + []string{"code", "method", "host"}, + ) +) + +// RegisterClientMetrics for controller-runtime sets up the client latency metrics from client-go. +func RegisterClientMetricsWithoutRequestTotal(registry prometheus.Registerer) { + // register the metrics with our registry + registry.MustRegister(requestLatency) + registry.MustRegister(requestSize) + registry.MustRegister(responseSize) + registry.MustRegister(rateLimiterLatency) + registry.MustRegister(requestRetry) + + // register the metrics with client-go + clientmetrics.RequestLatency = &latencyAdapter{metric: requestLatency} + clientmetrics.RequestSize = &sizeAdapter{metric: requestSize} + clientmetrics.ResponseSize = &sizeAdapter{metric: responseSize} + clientmetrics.RateLimiterLatency = &latencyAdapter{metric: rateLimiterLatency} + clientmetrics.RequestRetry = &retryAdapter{metric: requestRetry} +} + +// this section contains adapters, implementations, and other sundry organic, artisanally +// hand-crafted syntax trees required to convince client-go that it actually wants to let +// someone use its metrics. + +// Client metrics adapters (method #1 for client-go metrics), +// copied (more-or-less directly) from k8s.io/kubernetes setup code +// (which isn't anywhere in an easily-importable place). + +// latencyAdapter implements LatencyMetric. +type latencyAdapter struct { + metric *prometheus.HistogramVec +} + +// Observe increments the request latency metric for the given verb/URL. +func (l *latencyAdapter) Observe(_ context.Context, verb string, u url.URL, latency time.Duration) { + l.metric.WithLabelValues(verb, u.String()).Observe(latency.Seconds()) +} + +type sizeAdapter struct { + metric *prometheus.HistogramVec +} + +func (s *sizeAdapter) Observe(_ context.Context, verb, host string, size float64) { + s.metric.WithLabelValues(verb, host).Observe(size) +} + +type ResultAdapter struct { + metric *prometheus.CounterVec +} + +func (r *ResultAdapter) Increment(_ context.Context, code, method, host string) { + r.metric.WithLabelValues(code, method, host).Inc() +} + +type retryAdapter struct { + metric *prometheus.CounterVec +} + +func (r *retryAdapter) IncrementRetry(_ context.Context, code, method, host string) { + r.metric.WithLabelValues(code, method, host).Inc() +} diff --git a/internal/metrics/workqueue/metrics.go b/internal/metrics/workqueue/metrics.go new file mode 100644 index 0000000000..1543beaa16 --- /dev/null +++ b/internal/metrics/workqueue/metrics.go @@ -0,0 +1,151 @@ +// Copyright Envoy Gateway Authors +// SPDX-License-Identifier: Apache-2.0 +// The full text of the Apache license is available in the LICENSE file at +// the root of the repo. + +package workqueue + +import ( + "strconv" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/client-go/util/workqueue" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue +// which registers metrics to the k8s legacy Registry. We require very +// similar functionality, but must register metrics to a different Registry. + +// Metrics subsystem and all keys used by the workqueue. +const ( + WorkQueueSubsystem = metrics.WorkQueueSubsystem + DepthKey = metrics.DepthKey + AddsKey = metrics.AddsKey + QueueLatencyKey = metrics.QueueLatencyKey + WorkDurationKey = metrics.WorkDurationKey + UnfinishedWorkKey = metrics.UnfinishedWorkKey + LongestRunningProcessorKey = metrics.LongestRunningProcessorKey + RetriesKey = metrics.RetriesKey +) + +var ( + depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: DepthKey, + Help: "Current depth of workqueue by workqueue and priority", + }, []string{"name", "controller", "priority"}) + + adds = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: WorkQueueSubsystem, + Name: AddsKey, + Help: "Total number of adds handled by workqueue", + }, []string{"name", "controller"}) + + latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: WorkQueueSubsystem, + Name: QueueLatencyKey, + Help: "How long in seconds an item stays in workqueue before being requested", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"name", "controller"}) + + workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: WorkQueueSubsystem, + Name: WorkDurationKey, + Help: "How long in seconds processing an item from workqueue takes.", + Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12), + NativeHistogramBucketFactor: 1.1, + NativeHistogramMaxBucketNumber: 100, + NativeHistogramMinResetDuration: 1 * time.Hour, + }, []string{"name", "controller"}) + + unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: UnfinishedWorkKey, + Help: "How many seconds of work has been done that " + + "is in progress and hasn't been observed by work_duration. Large " + + "values indicate stuck threads. One can deduce the number of stuck " + + "threads by observing the rate at which this increases.", + }, []string{"name", "controller"}) + + longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Subsystem: WorkQueueSubsystem, + Name: LongestRunningProcessorKey, + Help: "How many seconds has the longest running " + + "processor for workqueue been running.", + }, []string{"name", "controller"}) + + retries = prometheus.NewCounterVec(prometheus.CounterOpts{ + Subsystem: WorkQueueSubsystem, + Name: RetriesKey, + Help: "Total number of retries handled by workqueue", + }, []string{"name", "controller"}) +) + +func RegisterMetrics(registry prometheus.Registerer) { + registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries) +} + +type WorkqueueMetricsProvider struct{} + +func (WorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric { + return depth.WithLabelValues(name, name, "") // no priority +} + +func (WorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric { + return adds.WithLabelValues(name, name) +} + +func (WorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric { + return latency.WithLabelValues(name, name) +} + +func (WorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric { + return workDuration.WithLabelValues(name, name) +} + +func (WorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric { + return unfinished.WithLabelValues(name, name) +} + +func (WorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric { + return longestRunningProcessor.WithLabelValues(name, name) +} + +func (WorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric { + return retries.WithLabelValues(name, name) +} + +type MetricsProviderWithPriority interface { + workqueue.MetricsProvider + + NewDepthMetricWithPriority(name string) DepthMetricWithPriority +} + +// DepthMetricWithPriority represents a depth metric with priority. +type DepthMetricWithPriority interface { + Inc(priority int) + Dec(priority int) +} + +var _ MetricsProviderWithPriority = WorkqueueMetricsProvider{} + +func (WorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority { + return &depthWithPriorityMetric{lvs: []string{name, name}} +} + +type depthWithPriorityMetric struct { + lvs []string +} + +func (g *depthWithPriorityMetric) Inc(priority int) { + depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc() +} + +func (g *depthWithPriorityMetric) Dec(priority int) { + depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec() +} diff --git a/internal/provider/kubernetes/controller.go b/internal/provider/kubernetes/controller.go index b4ba3aee49..12b6c13f3b 100644 --- a/internal/provider/kubernetes/controller.go +++ b/internal/provider/kubernetes/controller.go @@ -21,6 +21,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/discovery" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -43,6 +44,7 @@ import ( "github.com/envoyproxy/gateway/internal/gatewayapi/status" "github.com/envoyproxy/gateway/internal/logging" "github.com/envoyproxy/gateway/internal/message" + workqueuemetrics "github.com/envoyproxy/gateway/internal/metrics/workqueue" "github.com/envoyproxy/gateway/internal/utils" "github.com/envoyproxy/gateway/internal/utils/slice" "github.com/envoyproxy/gateway/internal/xds/bootstrap" @@ -121,7 +123,16 @@ func newGatewayAPIController(ctx context.Context, mgr manager.Manager, cfg *conf // controller-runtime doesn't allow run controller with same name for more than once // see https://github.com/kubernetes-sigs/controller-runtime/blob/2b941650bce159006c88bd3ca0d132c7bc40e947/pkg/controller/name.go#L29 name := fmt.Sprintf("gatewayapi-%d", time.Now().Unix()) - c, err := controller.New(name, mgr, controller.Options{Reconciler: r, SkipNameValidation: skipNameValidation()}) + c, err := controller.New(name, mgr, controller.Options{ + Reconciler: r, + SkipNameValidation: skipNameValidation(), + NewQueue: func(controllerName string, rateLimiter workqueue.TypedRateLimiter[reconcile.Request]) workqueue.TypedRateLimitingInterface[reconcile.Request] { + return workqueue.NewTypedRateLimitingQueueWithConfig(rateLimiter, workqueue.TypedRateLimitingQueueConfig[reconcile.Request]{ + Name: controllerName, + MetricsProvider: workqueuemetrics.WorkqueueMetricsProvider{}, + }) + }, + }) if err != nil { return fmt.Errorf("error creating controller: %w", err) } diff --git a/test/e2e/tests/metric.go b/test/e2e/tests/metric.go index 3c7fa2d4f9..cbfb848a47 100644 --- a/test/e2e/tests/metric.go +++ b/test/e2e/tests/metric.go @@ -15,6 +15,7 @@ import ( "testing" "time" + "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/wait" httputils "sigs.k8s.io/gateway-api/conformance/utils/http" @@ -23,10 +24,11 @@ import ( "sigs.k8s.io/gateway-api/conformance/utils/tlog" egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1" + "github.com/envoyproxy/gateway/test/utils/prometheus" ) func init() { - ConformanceTests = append(ConformanceTests, MetricTest, MetricCompressorTest) + ConformanceTests = append(ConformanceTests, MetricTest, MetricWorkqueueAndRestclientTest, MetricCompressorTest) } var MetricTest = suite.ConformanceTest{ @@ -99,6 +101,43 @@ var MetricTest = suite.ConformanceTest{ }, } +var MetricWorkqueueAndRestclientTest = suite.ConformanceTest{ + ShortName: "MetricWorkqueueAndRestclientTest", + Description: "Ensure workqueue and restclient metrics are exposed", + Test: func(t *testing.T, suite *suite.ConformanceTestSuite) { + ctx := context.Background() + promClient, err := prometheus.NewClient(suite.Client, + types.NamespacedName{Name: "prometheus", Namespace: "monitoring"}, + ) + require.NoError(t, err) + + verifyMetrics := func(t *testing.T, metricQuery string, metricName string) { + httputils.AwaitConvergence( + t, + suite.TimeoutConfig.RequiredConsecutiveSuccesses, + suite.TimeoutConfig.MaxTimeToConsistency, + func(_ time.Duration) bool { + v, err := promClient.QuerySum(ctx, metricQuery) + if err != nil { + tlog.Logf(t, "failed to get %s metrics: %v", metricName, err) + return false + } + tlog.Logf(t, "%s metrics query count: %v", metricName, v) + return true + }, + ) + } + + t.Run("verify workqueue metrics", func(t *testing.T) { + verifyMetrics(t, `workqueue_adds_total{namespace="envoy-gateway-system"}`, "workqueue") + }) + + t.Run("verify restclient metrics", func(t *testing.T) { + verifyMetrics(t, `rest_client_request_duration_seconds_sum{namespace="envoy-gateway-system"}`, "restclient") + }) + }, +} + var MetricCompressorTest = suite.ConformanceTest{ ShortName: "MetricCompressor", Description: "Make sure metric is working with compressor",