envoyproxy · arkodg · Apr 24, 2025 · Apr 14, 2025 · arkodg · Apr 23, 2025
@@ -24,6 +24,8 @@
 	egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1"
 	"github.com/envoyproxy/gateway/internal/envoygateway/config"
 	log "github.com/envoyproxy/gateway/internal/logging"
+	"github.com/envoyproxy/gateway/internal/metrics/restclient"
+	"github.com/envoyproxy/gateway/internal/metrics/workqueue"
 )
 
 const (
@@ -89,8 +91,14 @@
 		newOpts.pullOptions.disable = true
 	} else {
 		newOpts.pullOptions.disable = false
+		restclient.RegisterClientMetricsWithoutRequestTotal(metricsserver.Registry)
+		// Workqueue metrics are already registered in controller-runtime. Use another registry.
+		reg := prometheus.NewRegistry()
+		workqueue.RegisterMetrics(reg)
 		newOpts.pullOptions.registry = metricsserver.Registry
-		newOpts.pullOptions.gatherer = metricsserver.Registry
+		newOpts.pullOptions.gatherer = prometheus.Gatherers{
+			metricsserver.Registry, reg,
+		}
 	}
 
 	for _, config := range svr.EnvoyGateway.GetEnvoyGatewayTelemetry().Metrics.Sinks {

@@ -0,0 +1,127 @@
+// Copyright Envoy Gateway Authors
+// SPDX-License-Identifier: Apache-2.0
+// The full text of the Apache license is available in the LICENSE file at
+// the root of the repo.
+
+package restclient
+
+import (
+	"context"
+	"net/url"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	clientmetrics "k8s.io/client-go/tools/metrics"
+)
+
+var (
+	// requestLatency is a Prometheus Histogram metric type partitioned by
+	// "verb", and "host" labels. It is used for the rest client latency metrics.
+	requestLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "rest_client_request_duration_seconds",
+			Help:    "Request latency in seconds. Broken down by verb, and host.",
+			Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
+		},
+		[]string{"verb", "host"},
+	)
+
+	requestSize = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "rest_client_request_size_bytes",
+			Help: "Request size in bytes. Broken down by verb and host.",
+			// 64 bytes to 16MB
+			Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
+		},
+		[]string{"verb", "host"},
+	)
+
+	responseSize = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name: "rest_client_response_size_bytes",
+			Help: "Response size in bytes. Broken down by verb and host.",
+			// 64 bytes to 16MB
+			Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
+		},
+		[]string{"verb", "host"},
+	)
+
+	// RateLimiterLatency is the client side rate limiter latency metric.
+	rateLimiterLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Name:    "rest_client_rate_limiter_duration_seconds",
+			Help:    "Rate limiter latency in seconds. Broken down by verb, and host.",
+			Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
+		},
+		[]string{"verb", "host"},
+	)
+
+	// RequestRetry is the retry metric that tracks the number of
+	// retries sent to the server.
+	requestRetry = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "rest_client_requests_retry_total",
+			Help: "Number of HTTP requests retry, partitioned by status code, method, and host.",
+		},
+		[]string{"code", "method", "host"},
+	)
+)
+
+// RegisterClientMetrics for controller-runtime sets up the client latency metrics from client-go.
+func RegisterClientMetricsWithoutRequestTotal(registry prometheus.Registerer) {
+	// register the metrics with our registry
+	registry.MustRegister(requestLatency)
+	registry.MustRegister(requestSize)
+	registry.MustRegister(responseSize)
+	registry.MustRegister(rateLimiterLatency)
+	registry.MustRegister(requestRetry)
+
+	// register the metrics with client-go
+	clientmetrics.RequestLatency = &latencyAdapter{metric: requestLatency}
+	clientmetrics.RequestSize = &sizeAdapter{metric: requestSize}
+	clientmetrics.ResponseSize = &sizeAdapter{metric: responseSize}
+	clientmetrics.RateLimiterLatency = &latencyAdapter{metric: rateLimiterLatency}
+	clientmetrics.RequestRetry = &retryAdapter{metric: requestRetry}
+}
+
+// this section contains adapters, implementations, and other sundry organic, artisanally
+// hand-crafted syntax trees required to convince client-go that it actually wants to let
+// someone use its metrics.
+
+// Client metrics adapters (method #1 for client-go metrics),
+// copied (more-or-less directly) from k8s.io/kubernetes setup code
+// (which isn't anywhere in an easily-importable place).
+
+// latencyAdapter implements LatencyMetric.
+type latencyAdapter struct {
+	metric *prometheus.HistogramVec
+}
+
+// Observe increments the request latency metric for the given verb/URL.
+func (l *latencyAdapter) Observe(_ context.Context, verb string, u url.URL, latency time.Duration) {
+	l.metric.WithLabelValues(verb, u.String()).Observe(latency.Seconds())
+}
+
+type sizeAdapter struct {
+	metric *prometheus.HistogramVec
+}
+
+func (s *sizeAdapter) Observe(_ context.Context, verb, host string, size float64) {
+	s.metric.WithLabelValues(verb, host).Observe(size)
+}
+
+type ResultAdapter struct {
+	metric *prometheus.CounterVec
+}
+
+func (r *ResultAdapter) Increment(_ context.Context, code, method, host string) {
+	r.metric.WithLabelValues(code, method, host).Inc()
+}
+
+type retryAdapter struct {
+	metric *prometheus.CounterVec
+}
+
+func (r *retryAdapter) IncrementRetry(_ context.Context, code, method, host string) {
+	r.metric.WithLabelValues(code, method, host).Inc()
+}
@@ -0,0 +1,151 @@
+// Copyright Envoy Gateway Authors
+// SPDX-License-Identifier: Apache-2.0
+// The full text of the Apache license is available in the LICENSE file at
+// the root of the repo.
+
+package workqueue
+
+import (
+	"strconv"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/client-go/util/workqueue"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue
+// which registers metrics to the k8s legacy Registry. We require very
+// similar functionality, but must register metrics to a different Registry.
+
+// Metrics subsystem and all keys used by the workqueue.
+const (
+	WorkQueueSubsystem         = metrics.WorkQueueSubsystem
+	DepthKey                   = metrics.DepthKey
+	AddsKey                    = metrics.AddsKey
+	QueueLatencyKey            = metrics.QueueLatencyKey
+	WorkDurationKey            = metrics.WorkDurationKey
+	UnfinishedWorkKey          = metrics.UnfinishedWorkKey
+	LongestRunningProcessorKey = metrics.LongestRunningProcessorKey
+	RetriesKey                 = metrics.RetriesKey
+)
+
+var (
+	depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      DepthKey,
+		Help:      "Current depth of workqueue by workqueue and priority",
+	}, []string{"name", "controller", "priority"})
+
+	adds = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      AddsKey,
+		Help:      "Total number of adds handled by workqueue",
+	}, []string{"name", "controller"})
+
+	latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Subsystem:                       WorkQueueSubsystem,
+		Name:                            QueueLatencyKey,
+		Help:                            "How long in seconds an item stays in workqueue before being requested",
+		Buckets:                         prometheus.ExponentialBuckets(10e-9, 10, 12),
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
+	}, []string{"name", "controller"})
+
+	workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
+		Subsystem:                       WorkQueueSubsystem,
+		Name:                            WorkDurationKey,
+		Help:                            "How long in seconds processing an item from workqueue takes.",
+		Buckets:                         prometheus.ExponentialBuckets(10e-9, 10, 12),
+		NativeHistogramBucketFactor:     1.1,
+		NativeHistogramMaxBucketNumber:  100,
+		NativeHistogramMinResetDuration: 1 * time.Hour,
+	}, []string{"name", "controller"})
+
+	unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      UnfinishedWorkKey,
+		Help: "How many seconds of work has been done that " +
+			"is in progress and hasn't been observed by work_duration. Large " +
+			"values indicate stuck threads. One can deduce the number of stuck " +
+			"threads by observing the rate at which this increases.",
+	}, []string{"name", "controller"})
+
+	longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      LongestRunningProcessorKey,
+		Help: "How many seconds has the longest running " +
+			"processor for workqueue been running.",
+	}, []string{"name", "controller"})
+
+	retries = prometheus.NewCounterVec(prometheus.CounterOpts{
+		Subsystem: WorkQueueSubsystem,
+		Name:      RetriesKey,
+		Help:      "Total number of retries handled by workqueue",
+	}, []string{"name", "controller"})
+)
+
+func RegisterMetrics(registry prometheus.Registerer) {
+	registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries)
+}
+
+type WorkqueueMetricsProvider struct{}
+
+func (WorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
+	return depth.WithLabelValues(name, name, "") // no priority
+}
+
+func (WorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric {
+	return adds.WithLabelValues(name, name)
+}
+
+func (WorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
+	return latency.WithLabelValues(name, name)
+}
+
+func (WorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
+	return workDuration.WithLabelValues(name, name)
+}
+
+func (WorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
+	return unfinished.WithLabelValues(name, name)
+}
+
+func (WorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
+	return longestRunningProcessor.WithLabelValues(name, name)
+}
+
+func (WorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
+	return retries.WithLabelValues(name, name)
+}
+
+type MetricsProviderWithPriority interface {
+	workqueue.MetricsProvider
+
+	NewDepthMetricWithPriority(name string) DepthMetricWithPriority
+}
+
+// DepthMetricWithPriority represents a depth metric with priority.
+type DepthMetricWithPriority interface {
+	Inc(priority int)
+	Dec(priority int)
+}
+
+var _ MetricsProviderWithPriority = WorkqueueMetricsProvider{}
+
+func (WorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority {
+	return &depthWithPriorityMetric{lvs: []string{name, name}}
+}
+
+type depthWithPriorityMetric struct {
+	lvs []string
+}
+
+func (g *depthWithPriorityMetric) Inc(priority int) {
+	depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc()
+}
+
+func (g *depthWithPriorityMetric) Dec(priority int) {
+	depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec()
+}
@@ -21,6 +21,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/client-go/discovery"
+	"k8s.io/client-go/util/workqueue"
 	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller"
@@ -43,6 +44,7 @@ import (
 	"github.com/envoyproxy/gateway/internal/gatewayapi/status"
 	"github.com/envoyproxy/gateway/internal/logging"
 	"github.com/envoyproxy/gateway/internal/message"
+	workqueuemetrics "github.com/envoyproxy/gateway/internal/metrics/workqueue"
 	"github.com/envoyproxy/gateway/internal/utils"
 	"github.com/envoyproxy/gateway/internal/utils/slice"
 	"github.com/envoyproxy/gateway/internal/xds/bootstrap"
@@ -121,7 +123,16 @@ func newGatewayAPIController(ctx context.Context, mgr manager.Manager, cfg *conf
 	// controller-runtime doesn't allow run controller with same name for more than once
 	// see https://github.com/kubernetes-sigs/controller-runtime/blob/2b941650bce159006c88bd3ca0d132c7bc40e947/pkg/controller/name.go#L29
 	name := fmt.Sprintf("gatewayapi-%d", time.Now().Unix())
-	c, err := controller.New(name, mgr, controller.Options{Reconciler: r, SkipNameValidation: skipNameValidation()})
+	c, err := controller.New(name, mgr, controller.Options{
+		Reconciler:         r,
+		SkipNameValidation: skipNameValidation(),
+		NewQueue: func(controllerName string, rateLimiter workqueue.TypedRateLimiter[reconcile.Request]) workqueue.TypedRateLimitingInterface[reconcile.Request] {
+			return workqueue.NewTypedRateLimitingQueueWithConfig(rateLimiter, workqueue.TypedRateLimitingQueueConfig[reconcile.Request]{
+				Name:            controllerName,
+				MetricsProvider: workqueuemetrics.WorkqueueMetricsProvider{},
+			})
+		},
+	})
 	if err != nil {
 		return fmt.Errorf("error creating controller: %w", err)
 	}