Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion internal/metrics/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
egv1a1 "github.com/envoyproxy/gateway/api/v1alpha1"
"github.com/envoyproxy/gateway/internal/envoygateway/config"
log "github.com/envoyproxy/gateway/internal/logging"
"github.com/envoyproxy/gateway/internal/metrics/restclient"
"github.com/envoyproxy/gateway/internal/metrics/workqueue"
)

const (
Expand Down Expand Up @@ -89,8 +91,14 @@
newOpts.pullOptions.disable = true
} else {
newOpts.pullOptions.disable = false
restclient.RegisterClientMetricsWithoutRequestTotal(metricsserver.Registry)
// Workqueue metrics are already registered in controller-runtime. Use another registry.
reg := prometheus.NewRegistry()
workqueue.RegisterMetrics(reg)

Check warning on line 97 in internal/metrics/register.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/register.go#L94-L97

Added lines #L94 - L97 were not covered by tests
newOpts.pullOptions.registry = metricsserver.Registry
newOpts.pullOptions.gatherer = metricsserver.Registry
newOpts.pullOptions.gatherer = prometheus.Gatherers{
metricsserver.Registry, reg,
}

Check warning on line 101 in internal/metrics/register.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/register.go#L99-L101

Added lines #L99 - L101 were not covered by tests
}

for _, config := range svr.EnvoyGateway.GetEnvoyGatewayTelemetry().Metrics.Sinks {
Expand Down
127 changes: 127 additions & 0 deletions internal/metrics/restclient/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
// Copyright Envoy Gateway Authors
// SPDX-License-Identifier: Apache-2.0
// The full text of the Apache license is available in the LICENSE file at
// the root of the repo.

package restclient

import (
"context"
"net/url"
"time"

"github.com/prometheus/client_golang/prometheus"
clientmetrics "k8s.io/client-go/tools/metrics"
)

var (
// requestLatency is a Prometheus Histogram metric type partitioned by
// "verb", and "host" labels. It is used for the rest client latency metrics.
requestLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_request_duration_seconds",
Help: "Request latency in seconds. Broken down by verb, and host.",
Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
},
[]string{"verb", "host"},
)

requestSize = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_request_size_bytes",
Help: "Request size in bytes. Broken down by verb and host.",
// 64 bytes to 16MB
Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
},
[]string{"verb", "host"},
)

responseSize = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_response_size_bytes",
Help: "Response size in bytes. Broken down by verb and host.",
// 64 bytes to 16MB
Buckets: []float64{64, 256, 512, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216},
},
[]string{"verb", "host"},
)

// RateLimiterLatency is the client side rate limiter latency metric.
rateLimiterLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "rest_client_rate_limiter_duration_seconds",
Help: "Rate limiter latency in seconds. Broken down by verb, and host.",
Buckets: []float64{0.005, 0.025, 0.1, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 15.0, 30.0, 60.0},
},
[]string{"verb", "host"},
)

// RequestRetry is the retry metric that tracks the number of
// retries sent to the server.
requestRetry = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "rest_client_requests_retry_total",
Help: "Number of HTTP requests retry, partitioned by status code, method, and host.",
},
[]string{"code", "method", "host"},
)
)

// RegisterClientMetrics for controller-runtime sets up the client latency metrics from client-go.
func RegisterClientMetricsWithoutRequestTotal(registry prometheus.Registerer) {
// register the metrics with our registry
registry.MustRegister(requestLatency)
registry.MustRegister(requestSize)
registry.MustRegister(responseSize)
registry.MustRegister(rateLimiterLatency)
registry.MustRegister(requestRetry)

// register the metrics with client-go
clientmetrics.RequestLatency = &latencyAdapter{metric: requestLatency}
clientmetrics.RequestSize = &sizeAdapter{metric: requestSize}
clientmetrics.ResponseSize = &sizeAdapter{metric: responseSize}
clientmetrics.RateLimiterLatency = &latencyAdapter{metric: rateLimiterLatency}
clientmetrics.RequestRetry = &retryAdapter{metric: requestRetry}

Check warning on line 84 in internal/metrics/restclient/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/restclient/metrics.go#L71-L84

Added lines #L71 - L84 were not covered by tests
}

// this section contains adapters, implementations, and other sundry organic, artisanally
// hand-crafted syntax trees required to convince client-go that it actually wants to let
// someone use its metrics.

// Client metrics adapters (method #1 for client-go metrics),
// copied (more-or-less directly) from k8s.io/kubernetes setup code
// (which isn't anywhere in an easily-importable place).

// latencyAdapter implements LatencyMetric.
type latencyAdapter struct {
metric *prometheus.HistogramVec
}

// Observe increments the request latency metric for the given verb/URL.
func (l *latencyAdapter) Observe(_ context.Context, verb string, u url.URL, latency time.Duration) {
l.metric.WithLabelValues(verb, u.String()).Observe(latency.Seconds())

Check warning on line 102 in internal/metrics/restclient/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/restclient/metrics.go#L101-L102

Added lines #L101 - L102 were not covered by tests
}

type sizeAdapter struct {
metric *prometheus.HistogramVec
}

func (s *sizeAdapter) Observe(_ context.Context, verb, host string, size float64) {
s.metric.WithLabelValues(verb, host).Observe(size)

Check warning on line 110 in internal/metrics/restclient/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/restclient/metrics.go#L109-L110

Added lines #L109 - L110 were not covered by tests
}

type ResultAdapter struct {
metric *prometheus.CounterVec
}

func (r *ResultAdapter) Increment(_ context.Context, code, method, host string) {
r.metric.WithLabelValues(code, method, host).Inc()

Check warning on line 118 in internal/metrics/restclient/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/restclient/metrics.go#L117-L118

Added lines #L117 - L118 were not covered by tests
}

type retryAdapter struct {
metric *prometheus.CounterVec
}

func (r *retryAdapter) IncrementRetry(_ context.Context, code, method, host string) {
r.metric.WithLabelValues(code, method, host).Inc()

Check warning on line 126 in internal/metrics/restclient/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/restclient/metrics.go#L125-L126

Added lines #L125 - L126 were not covered by tests
}
151 changes: 151 additions & 0 deletions internal/metrics/workqueue/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
// Copyright Envoy Gateway Authors
// SPDX-License-Identifier: Apache-2.0
// The full text of the Apache license is available in the LICENSE file at
// the root of the repo.

package workqueue

import (
"strconv"
"time"

"github.com/prometheus/client_golang/prometheus"
"k8s.io/client-go/util/workqueue"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// This file is copied and adapted from k8s.io/component-base/metrics/prometheus/workqueue
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

@arkodg arkodg Apr 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reread the title description, trying to better understand the conflict

Copy link
Contributor Author

@Inode1 Inode1 Apr 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, this and this both attempt to initialize workqueue metrics. But only one provider can be registered — enforced here in client-go. Component-base wins in our case. EG dont use PQ

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks

// which registers metrics to the k8s legacy Registry. We require very
// similar functionality, but must register metrics to a different Registry.

// Metrics subsystem and all keys used by the workqueue.
const (
WorkQueueSubsystem = metrics.WorkQueueSubsystem
DepthKey = metrics.DepthKey
AddsKey = metrics.AddsKey
QueueLatencyKey = metrics.QueueLatencyKey
WorkDurationKey = metrics.WorkDurationKey
UnfinishedWorkKey = metrics.UnfinishedWorkKey
LongestRunningProcessorKey = metrics.LongestRunningProcessorKey
RetriesKey = metrics.RetriesKey
)

var (
depth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: WorkQueueSubsystem,
Name: DepthKey,
Help: "Current depth of workqueue by workqueue and priority",
}, []string{"name", "controller", "priority"})

adds = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: WorkQueueSubsystem,
Name: AddsKey,
Help: "Total number of adds handled by workqueue",
}, []string{"name", "controller"})

latency = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: WorkQueueSubsystem,
Name: QueueLatencyKey,
Help: "How long in seconds an item stays in workqueue before being requested",
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12),
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
}, []string{"name", "controller"})

workDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Subsystem: WorkQueueSubsystem,
Name: WorkDurationKey,
Help: "How long in seconds processing an item from workqueue takes.",
Buckets: prometheus.ExponentialBuckets(10e-9, 10, 12),
NativeHistogramBucketFactor: 1.1,
NativeHistogramMaxBucketNumber: 100,
NativeHistogramMinResetDuration: 1 * time.Hour,
}, []string{"name", "controller"})

unfinished = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: WorkQueueSubsystem,
Name: UnfinishedWorkKey,
Help: "How many seconds of work has been done that " +
"is in progress and hasn't been observed by work_duration. Large " +
"values indicate stuck threads. One can deduce the number of stuck " +
"threads by observing the rate at which this increases.",
}, []string{"name", "controller"})

longestRunningProcessor = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Subsystem: WorkQueueSubsystem,
Name: LongestRunningProcessorKey,
Help: "How many seconds has the longest running " +
"processor for workqueue been running.",
}, []string{"name", "controller"})

retries = prometheus.NewCounterVec(prometheus.CounterOpts{
Subsystem: WorkQueueSubsystem,
Name: RetriesKey,
Help: "Total number of retries handled by workqueue",
}, []string{"name", "controller"})
)

func RegisterMetrics(registry prometheus.Registerer) {
registry.MustRegister(depth, adds, latency, workDuration, unfinished, longestRunningProcessor, retries)

Check warning on line 90 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L89-L90

Added lines #L89 - L90 were not covered by tests
}

type WorkqueueMetricsProvider struct{}

func (WorkqueueMetricsProvider) NewDepthMetric(name string) workqueue.GaugeMetric {
return depth.WithLabelValues(name, name, "") // no priority

Check warning on line 96 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L95-L96

Added lines #L95 - L96 were not covered by tests
}

func (WorkqueueMetricsProvider) NewAddsMetric(name string) workqueue.CounterMetric {
return adds.WithLabelValues(name, name)

Check warning on line 100 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L99-L100

Added lines #L99 - L100 were not covered by tests
}

func (WorkqueueMetricsProvider) NewLatencyMetric(name string) workqueue.HistogramMetric {
return latency.WithLabelValues(name, name)

Check warning on line 104 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L103-L104

Added lines #L103 - L104 were not covered by tests
}

func (WorkqueueMetricsProvider) NewWorkDurationMetric(name string) workqueue.HistogramMetric {
return workDuration.WithLabelValues(name, name)

Check warning on line 108 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L107-L108

Added lines #L107 - L108 were not covered by tests
}

func (WorkqueueMetricsProvider) NewUnfinishedWorkSecondsMetric(name string) workqueue.SettableGaugeMetric {
return unfinished.WithLabelValues(name, name)

Check warning on line 112 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L111-L112

Added lines #L111 - L112 were not covered by tests
}

func (WorkqueueMetricsProvider) NewLongestRunningProcessorSecondsMetric(name string) workqueue.SettableGaugeMetric {
return longestRunningProcessor.WithLabelValues(name, name)

Check warning on line 116 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L115-L116

Added lines #L115 - L116 were not covered by tests
}

func (WorkqueueMetricsProvider) NewRetriesMetric(name string) workqueue.CounterMetric {
return retries.WithLabelValues(name, name)

Check warning on line 120 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L119-L120

Added lines #L119 - L120 were not covered by tests
}

type MetricsProviderWithPriority interface {
workqueue.MetricsProvider

NewDepthMetricWithPriority(name string) DepthMetricWithPriority
}

// DepthMetricWithPriority represents a depth metric with priority.
type DepthMetricWithPriority interface {
Inc(priority int)
Dec(priority int)
}

var _ MetricsProviderWithPriority = WorkqueueMetricsProvider{}

func (WorkqueueMetricsProvider) NewDepthMetricWithPriority(name string) DepthMetricWithPriority {
return &depthWithPriorityMetric{lvs: []string{name, name}}

Check warning on line 138 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L137-L138

Added lines #L137 - L138 were not covered by tests
}

type depthWithPriorityMetric struct {
lvs []string
}

func (g *depthWithPriorityMetric) Inc(priority int) {
depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Inc()

Check warning on line 146 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L145-L146

Added lines #L145 - L146 were not covered by tests
}

func (g *depthWithPriorityMetric) Dec(priority int) {
depth.WithLabelValues(append(g.lvs, strconv.Itoa(priority))...).Dec()

Check warning on line 150 in internal/metrics/workqueue/metrics.go

View check run for this annotation

Codecov / codecov/patch

internal/metrics/workqueue/metrics.go#L149-L150

Added lines #L149 - L150 were not covered by tests
}
13 changes: 12 additions & 1 deletion internal/provider/kubernetes/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/discovery"
"k8s.io/client-go/util/workqueue"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
Expand All @@ -43,6 +44,7 @@ import (
"github.com/envoyproxy/gateway/internal/gatewayapi/status"
"github.com/envoyproxy/gateway/internal/logging"
"github.com/envoyproxy/gateway/internal/message"
workqueuemetrics "github.com/envoyproxy/gateway/internal/metrics/workqueue"
"github.com/envoyproxy/gateway/internal/utils"
"github.com/envoyproxy/gateway/internal/utils/slice"
"github.com/envoyproxy/gateway/internal/xds/bootstrap"
Expand Down Expand Up @@ -121,7 +123,16 @@ func newGatewayAPIController(ctx context.Context, mgr manager.Manager, cfg *conf
// controller-runtime doesn't allow run controller with same name for more than once
// see https://github.com/kubernetes-sigs/controller-runtime/blob/2b941650bce159006c88bd3ca0d132c7bc40e947/pkg/controller/name.go#L29
name := fmt.Sprintf("gatewayapi-%d", time.Now().Unix())
c, err := controller.New(name, mgr, controller.Options{Reconciler: r, SkipNameValidation: skipNameValidation()})
c, err := controller.New(name, mgr, controller.Options{
Reconciler: r,
SkipNameValidation: skipNameValidation(),
NewQueue: func(controllerName string, rateLimiter workqueue.TypedRateLimiter[reconcile.Request]) workqueue.TypedRateLimitingInterface[reconcile.Request] {
return workqueue.NewTypedRateLimitingQueueWithConfig(rateLimiter, workqueue.TypedRateLimitingQueueConfig[reconcile.Request]{
Name: controllerName,
MetricsProvider: workqueuemetrics.WorkqueueMetricsProvider{},
})
},
})
if err != nil {
return fmt.Errorf("error creating controller: %w", err)
}
Expand Down
Loading