diff --git a/go/internal/services/keys/validation.go b/go/internal/services/keys/validation.go index cec8a3c2e6..02c8703bb5 100644 --- a/go/internal/services/keys/validation.go +++ b/go/internal/services/keys/validation.go @@ -4,7 +4,6 @@ import ( "context" "database/sql" "fmt" - "strconv" "strings" "time" @@ -16,7 +15,6 @@ import ( "github.com/unkeyed/unkey/go/pkg/codes" "github.com/unkeyed/unkey/go/pkg/fault" "github.com/unkeyed/unkey/go/pkg/otel/tracing" - "github.com/unkeyed/unkey/go/pkg/prometheus/metrics" "github.com/unkeyed/unkey/go/pkg/ptr" "github.com/unkeyed/unkey/go/pkg/rbac" ) @@ -44,26 +42,6 @@ func (k *KeyVerifier) withCredits(ctx context.Context, cost int32) error { k.setInvalid(StatusUsageExceeded, "Key usage limit exceeded.") } - // Emit Prometheus metrics for credits spent - identityID := "" - if k.Key.IdentityID.Valid { - identityID = k.Key.IdentityID.String - } - - // Credits are deducted when usage is valid AND cost > 0 - deducted := usage.Valid && cost > 0 - actualCostDeducted := int32(0) - if deducted { - actualCostDeducted = cost - } - - metrics.KeyCreditsSpentTotal.WithLabelValues( - k.AuthorizedWorkspaceID, // workspace_id - k.Key.ID, // key_id - identityID, // identity_id - strconv.FormatBool(deducted), // deducted - whether credits were actually deducted - ).Add(float64(actualCostDeducted)) // Add the actual amount deducted, not the requested cost - return nil } diff --git a/go/pkg/circuitbreaker/lib.go b/go/pkg/circuitbreaker/lib.go index f51761045d..55b9a768f7 100644 --- a/go/pkg/circuitbreaker/lib.go +++ b/go/pkg/circuitbreaker/lib.go @@ -9,6 +9,7 @@ import ( "github.com/unkeyed/unkey/go/pkg/clock" "github.com/unkeyed/unkey/go/pkg/otel/logging" "github.com/unkeyed/unkey/go/pkg/otel/tracing" + "github.com/unkeyed/unkey/go/pkg/prometheus/metrics" ) type CB[Res any] struct { @@ -198,7 +199,7 @@ func (cb *CB[Res]) preflight(ctx context.Context) error { cb.resetStateAt = now.Add(cb.config.timeout) } - 
requests.WithLabelValues(cb.config.name, string(cb.state)).Inc() + metrics.CircuitBreakerRequests.WithLabelValues(cb.config.name, string(cb.state)).Inc() if cb.state == Open { return ErrTripped diff --git a/go/pkg/circuitbreaker/metrics.go b/go/pkg/circuitbreaker/metrics.go deleted file mode 100644 index 490cb0332f..0000000000 --- a/go/pkg/circuitbreaker/metrics.go +++ /dev/null @@ -1,15 +0,0 @@ -package circuitbreaker - -import ( - "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" -) - -var ( - requests = promauto.NewCounterVec(prometheus.CounterOpts{ - Namespace: "agent", - Subsystem: "circuitbreaker", - Name: "requests_total", - Help: "Total number of requests processed by circuit breaker", - }, []string{"name", "state"}) -) diff --git a/go/pkg/db/replica.go b/go/pkg/db/replica.go index 70723c8586..1c8941b949 100644 --- a/go/pkg/db/replica.go +++ b/go/pkg/db/replica.go @@ -43,8 +43,8 @@ func (r *Replica) ExecContext(ctx context.Context, query string, args ...interfa status = "error" } - metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "exec", status).Observe(duration) - metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "exec", status).Inc() + metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "exec", status).Observe(duration) + metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "exec", status).Inc() return result, err } @@ -69,8 +69,8 @@ func (r *Replica) PrepareContext(ctx context.Context, query string) (*sql.Stmt, status = "error" } - metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "prepare", status).Observe(duration) - metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "prepare", status).Inc() + metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "prepare", status).Observe(duration) + metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "prepare", status).Inc() return stmt, err // nolint:sqlclosecheck } @@ -95,8 +95,8 @@ func (r *Replica) QueryContext(ctx 
context.Context, query string, args ...interf status = "error" } - metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "query", status).Observe(duration) - metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "query", status).Inc() + metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "query", status).Observe(duration) + metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "query", status).Inc() return rows, err // nolint:sqlclosecheck } @@ -119,8 +119,8 @@ func (r *Replica) QueryRowContext(ctx context.Context, query string, args ...int // QueryRowContext doesn't return an error, but we can still track timing status := "success" - metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "query_row", status).Observe(duration) - metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "query_row", status).Inc() + metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "query_row", status).Observe(duration) + metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "query_row", status).Inc() return row } @@ -143,8 +143,8 @@ func (r *Replica) Begin(ctx context.Context) (*sql.Tx, error) { status = "error" } - metrics.DatabaseOperationLatency.WithLabelValues(r.mode, "begin", status).Observe(duration) - metrics.DatabaseOperationTotal.WithLabelValues(r.mode, "begin", status).Inc() + metrics.DatabaseOperationsLatency.WithLabelValues(r.mode, "begin", status).Observe(duration) + metrics.DatabaseOperationsTotal.WithLabelValues(r.mode, "begin", status).Inc() return tx, err } diff --git a/go/pkg/prometheus/metrics/batch.go b/go/pkg/prometheus/metrics/batch.go index 0287b1367e..7c34610cb5 100644 --- a/go/pkg/prometheus/metrics/batch.go +++ b/go/pkg/prometheus/metrics/batch.go @@ -84,4 +84,21 @@ var ( }, []string{"name"}, ) + + // BatchItemsProcessedErrorsTotal tracks the total number of items that resulted in errors + // during batch processing, labeled by batch name. + // Use this counter to monitor error rates in batch processing and identify problematic batches. 
+ // + // Example usage: + // metrics.BatchItemsProcessedErrorsTotal.WithLabelValues("database_writes").Add(float64(errorCount)) + BatchItemsProcessedErrorsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "batch", + Name: "items_processed_errors_total", + Help: "Total number of items processed through batches that resulted in an error", + ConstLabels: constLabels, + }, + []string{"name"}, + ) ) diff --git a/go/pkg/prometheus/metrics/buffer.go b/go/pkg/prometheus/metrics/buffer.go index 444a4c2fef..9b238664a3 100644 --- a/go/pkg/prometheus/metrics/buffer.go +++ b/go/pkg/prometheus/metrics/buffer.go @@ -47,4 +47,20 @@ var ( }, []string{"name", "drop"}, ) + + // BufferErrorsTotal tracks the total number of buffer operation errors, + // labeled by buffer name and error type. Use this counter to monitor buffer error rates. + // + // Example usage: + // metrics.BufferErrorsTotal.WithLabelValues("batch_writer", "write_failed").Inc() + BufferErrorsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "buffer", + Name: "errors_total", + Help: "Total number of buffer operation errors by name and state.", + ConstLabels: constLabels, + }, + []string{"name", "state"}, + ) ) diff --git a/go/pkg/prometheus/metrics/cache.go b/go/pkg/prometheus/metrics/cache.go index 4463fd8acb..09fa4abefa 100644 --- a/go/pkg/prometheus/metrics/cache.go +++ b/go/pkg/prometheus/metrics/cache.go @@ -19,6 +19,7 @@ var ( // metrics.CacheHits.WithLabelValues("user_profile") CacheReads = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "cache", Name: "reads_total", Help: "Number of cache reads by resource type and hit status.", @@ -35,6 +36,7 @@ var ( // metrics.CacheWrites.WithLabelValues("user_profile").Set(float64(writeCount)) CacheWrites = promauto.NewGaugeVec( prometheus.GaugeOpts{ + Namespace: "unkey", Subsystem: "cache", Name: "writes", Help: "Number of cache writes by resource 
type.", @@ -52,6 +54,7 @@ var ( // metrics.CacheDeleted.WithLabelValues("user_profile", "capacity").Set(float64(evictionCount)) CacheDeleted = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "cache", Name: "deleted_total", Help: "Number of cache entries deleted by resource type and reason.", @@ -67,6 +70,7 @@ var ( // metrics.CacheSize.WithLabelValues("user_profile").Set(float64(cacheSize)) CacheSize = promauto.NewGaugeVec( prometheus.GaugeOpts{ + Namespace: "unkey", Subsystem: "cache", Name: "size", Help: "Current number of entries in the cache by resource type.", @@ -82,6 +86,7 @@ var ( // metrics.CacheCapacity.WithLabelValues("user_profile").Set(float64(cacheCapacity)) CacheCapacity = promauto.NewGaugeVec( prometheus.GaugeOpts{ + Namespace: "unkey", Subsystem: "cache", Name: "capacity", Help: "Maximum capacity of the cache by resource type.", @@ -97,6 +102,7 @@ var ( // metrics.CacheRevalidations.WithLabelValues("user_profile").Inc() CacheRevalidations = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "cache", Name: "revalidations_total", Help: "Total number of cache revalidations by resource type.", @@ -104,4 +110,36 @@ var ( }, []string{"resource"}, ) + + // CacheReadsErrorsTotal tracks the total number of cache read errors, + // labeled by resource type. Use this counter to monitor cache read error rates. + // + // Example usage: + // metrics.CacheReadsErrorsTotal.WithLabelValues("user_profile").Inc() + CacheReadsErrorsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "cache", + Name: "reads_errors_total", + Help: "Total number of cache read errors by resource type.", + ConstLabels: constLabels, + }, + []string{"resource"}, + ) + + // CacheRevalidationsErrorsTotal tracks the total number of cache revalidation errors, + // labeled by resource type. Use this counter to monitor cache revalidation error rates. 
+ // + // Example usage: + // metrics.CacheRevalidationsErrorsTotal.WithLabelValues("user_profile").Inc() + CacheRevalidationsErrorsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "cache", + Name: "revalidations_errors_total", + Help: "Total number of cache revalidation errors by resource type.", + ConstLabels: constLabels, + }, + []string{"resource"}, + ) ) diff --git a/go/pkg/prometheus/metrics/chproxy.go b/go/pkg/prometheus/metrics/chproxy.go index 9d67951c06..9631d41f55 100644 --- a/go/pkg/prometheus/metrics/chproxy.go +++ b/go/pkg/prometheus/metrics/chproxy.go @@ -18,6 +18,7 @@ var ( // metrics.ChproxyRequestsTotal.WithLabelValues("verifications").Inc() ChproxyRequestsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "chproxy", Name: "requests_total", Help: "Total number of ClickHouse proxy requests processed.", @@ -26,6 +27,22 @@ var ( []string{"endpoint"}, ) + // ChproxyErrorsTotal tracks the total number of errors encountered by ClickHouse proxy, + // labeled by endpoint. Use this counter to monitor error rates and identify problematic endpoints. + // + // Example usage: + // metrics.ChproxyErrorsTotal.WithLabelValues("verifications").Inc() + ChproxyErrorsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "chproxy", + Name: "errors_total", + Help: "Total number of errors encountered by ClickHouse proxy.", + ConstLabels: constLabels, + }, + []string{"endpoint"}, + ) + // ChproxyRowsTotal tracks the total number of rows/events received in chproxy requests. // Use this counter to monitor data volume and ingestion patterns. 
// @@ -33,6 +50,7 @@ var ( // metrics.ChproxyRowsTotal.WithLabelValues("verifications").Add(float64(len(events))) ChproxyRowsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "chproxy", Name: "rows_total", Help: "Total number of rows/events processed by ClickHouse proxy.", @@ -40,4 +58,20 @@ }, []string{"endpoint"}, ) + + // ChproxyRowsErrorsTotal tracks the total number of row processing errors in ClickHouse proxy, + // labeled by endpoint. Use this counter to monitor row processing error rates. + // + // Example usage: + // metrics.ChproxyRowsErrorsTotal.WithLabelValues("verifications").Inc() + ChproxyRowsErrorsTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "chproxy", + Name: "rows_errors_total", + Help: "Total number of row processing errors in ClickHouse proxy.", + ConstLabels: constLabels, + }, + []string{"endpoint"}, + ) ) diff --git a/go/pkg/prometheus/metrics/circuitbreaker.go b/go/pkg/prometheus/metrics/circuitbreaker.go index 533b36eba9..798270809b 100644 --- a/go/pkg/prometheus/metrics/circuitbreaker.go +++ b/go/pkg/prometheus/metrics/circuitbreaker.go @@ -12,9 +12,23 @@ var ( // Example usage: // metrics.CircuitBreakerRequests.WithLabelValues("my_circuit_breaker", "open").Inc() CircuitBreakerRequests = promauto.NewCounterVec(prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "circuitbreaker", Name: "requests_total", Help: "Tracks the number of requests made to the circuitbreaker by state.", ConstLabels: constLabels, - }, []string{"name", "state"}) + }, []string{"name", "state"}) + + // CircuitBreakerErrorsTotal tracks the total number of circuit breaker errors, + // labeled by service and action. Use this counter to monitor circuit breaker error rates.
+ // + // Example usage: + // metrics.CircuitBreakerErrorsTotal.WithLabelValues("database", "timeout").Inc() + CircuitBreakerErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "circuitbreaker", + Name: "errors_total", + Help: "Total number of circuit breaker errors by service and action.", + ConstLabels: constLabels, + }, []string{"service", "action"}) ) diff --git a/go/pkg/prometheus/metrics/database.go b/go/pkg/prometheus/metrics/database.go index 10dc94d6a8..72e8f572bc 100644 --- a/go/pkg/prometheus/metrics/database.go +++ b/go/pkg/prometheus/metrics/database.go @@ -12,19 +12,20 @@ import ( ) var ( - // DatabaseOperationLatency tracks database operation latencies as a histogram, + // DatabaseOperationsLatency tracks database operation latencies as a histogram, // labeled by replica type (rw/ro), operation type, and success status. // This collector uses predefined buckets optimized for typical database operation latencies. // // Example usage: // timer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) { - // metrics.DatabaseOperationLatency.WithLabelValues("rw", "exec", "success").Observe(v) + // metrics.DatabaseOperationsLatency.WithLabelValues("rw", "exec", "success").Observe(v) // })) // defer timer.ObserveDuration() - DatabaseOperationLatency = promauto.NewHistogramVec( + DatabaseOperationsLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ + Namespace: "unkey", Subsystem: "database", - Name: "operation_latency_seconds", + Name: "operations_latency_seconds", Help: "Histogram of database operation latencies in seconds.", Buckets: latencyBuckets, ConstLabels: constLabels, @@ -39,8 +40,9 @@ var ( // Example usage: // metrics.DatabaseOperationTotal.WithLabelValues("rw", "exec", "success").Inc() // metrics.DatabaseOperationTotal.WithLabelValues("ro", "query", "error").Inc() - DatabaseOperationTotal = promauto.NewCounterVec( + DatabaseOperationsTotal = promauto.NewCounterVec( 
prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "database", Name: "operations_total", Help: "Total number of database operations processed.", @@ -48,4 +50,18 @@ var ( }, []string{"replica", "operation", "status"}, ) + + // DatabaseOperationsErrorsTotal tracks the total number of database operation errors, + // labeled by replica type (rw/ro), and operation type. + // Use this counter to monitor database error rates and identify problematic operations. + // + // Example usage: + // metrics.DatabaseOperationsErrorsTotal.WithLabelValues("rw", "exec").Inc() + DatabaseOperationsErrorsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "database", + Name: "operations_errors_total", + Help: "Total number of database operation errors.", + ConstLabels: constLabels, + }, []string{"replica", "operation"}) ) diff --git a/go/pkg/prometheus/metrics/http.go b/go/pkg/prometheus/metrics/http.go index e68763dace..5cc98ab59e 100644 --- a/go/pkg/prometheus/metrics/http.go +++ b/go/pkg/prometheus/metrics/http.go @@ -57,6 +57,7 @@ var ( // defer timer.ObserveDuration() HTTPRequestLatency = promauto.NewHistogramVec( prometheus.HistogramOpts{ + Namespace: "unkey", Subsystem: "http", Name: "request_latency_seconds", Help: "Histogram of HTTP request latencies in seconds.", @@ -73,6 +74,7 @@ var ( // metrics.HTTPRequestTotal.WithLabelValues("GET", "/users", "200").Inc() HTTPRequestTotal = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "http", Name: "requests_total", Help: "Total number of HTTP requests processed.", @@ -81,6 +83,22 @@ var ( []string{"method", "path", "status"}, ) + // HTTPRequestErrorTotal tracks the total number of HTTP request errors, + // labeled by method, path, and status. Use this counter to monitor error rates by endpoint. 
+ // + // Example usage: + // metrics.HTTPRequestErrorTotal.WithLabelValues("POST", "/api/keys", "500").Inc() + HTTPRequestErrorTotal = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "http", + Name: "requests_errors_total", + Help: "Total number of HTTP request errors.", + ConstLabels: constLabels, + }, + []string{"method", "path", "status"}, + ) + // HTTPRequestBodySize tracks the distribution of HTTP request body sizes as a histogram, // labeled by method, path, and status. This helps monitor payload sizes and identify potentially // problematic large requests. @@ -89,6 +107,7 @@ var ( // metrics.HTTPRequestBodySize.WithLabelValues("POST", "/api/upload", "200").Observe(float64(bodySize)) HTTPRequestBodySize = promauto.NewHistogramVec( prometheus.HistogramOpts{ + Namespace: "unkey", Subsystem: "http", Name: "request_body_size_bytes", Help: "Histogram of HTTP request body sizes in bytes.", diff --git a/go/pkg/prometheus/metrics/keys.go b/go/pkg/prometheus/metrics/keys.go index bf059b7ffe..bb846c43fb 100644 --- a/go/pkg/prometheus/metrics/keys.go +++ b/go/pkg/prometheus/metrics/keys.go @@ -13,12 +13,13 @@ import ( var ( // KeyVerificationsTotal tracks the number of key verifications handled, labeled by type and outcome. // The type should be either "root_key" or "key" - // Use this counter to monitor API traffic patterns and error rates. + // Use this counter to monitor API traffic patterns. // // Example usage: // metrics.KeyVerificationsTotal.WithLabelValues("root_key", "VALID").Inc() KeyVerificationsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "key", Name: "verifications_total", Help: "Total number of Key verifications processed.", @@ -27,17 +28,21 @@ var ( []string{"type", "code"}, ) - // KeyCreditsSpentTotal tracks the total credits spent by keys, labeled by workspace ID, key ID, and identity ID. - // Use this counter to monitor credit usage patterns and error rates. 
+ // KeyVerificationErrorsTotal tracks the number of errors in key verifications. + // These are not errors in the keys themselves like "FORBIDDEN", or "RATE_LIMITED" but errors in + // program functionality. Use this with the unkey_key_verifications_total metric to calculate + // the error rate. // // Example usage: - // metrics.KeyCreditsSpentTotal.WithLabelValues("ws_1234", "key_abcd", "identity_xyz", "true").Add(5) - KeyCreditsSpentTotal = promauto.NewCounterVec( + // metrics.KeyVerificationErrorsTotal.WithLabelValues("root_key").Inc() + KeyVerificationErrorsTotal = promauto.NewCounterVec( prometheus.CounterOpts{ - Subsystem: "key", - Name: "credits_spent_total", - Help: "Total credits spent by keys", + Namespace: "unkey", + Subsystem: "key", + Name: "verification_errors_total", + Help: "Total number of key verification errors", + ConstLabels: constLabels, }, - []string{"workspace_id", "key_id", "identity_id", "deducted"}, + []string{"type"}, ) ) diff --git a/go/pkg/prometheus/metrics/panic.go b/go/pkg/prometheus/metrics/panic.go index e6b34514f1..6e81d0a9a7 100644 --- a/go/pkg/prometheus/metrics/panic.go +++ b/go/pkg/prometheus/metrics/panic.go @@ -13,7 +13,7 @@ import ( var ( PanicsTotal = promauto.NewCounterVec(prometheus.CounterOpts{ Namespace: "unkey", - Subsystem: "handler", + Subsystem: "internal", Name: "panics_total", Help: "Counter to track panics across http handlers", }, []string{"caller", "path"}) diff --git a/go/pkg/prometheus/metrics/ratelimit.go b/go/pkg/prometheus/metrics/ratelimit.go index 2b751ae2a2..4bb21727ce 100644 --- a/go/pkg/prometheus/metrics/ratelimit.go +++ b/go/pkg/prometheus/metrics/ratelimit.go @@ -19,6 +19,7 @@ var ( // metrics.RatelimitBuckets.Set(float64(activeBuckets)) RatelimitBuckets = promauto.NewGauge( prometheus.GaugeOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "buckets", Help: "Current number of active rate-limit buckets.", @@ -33,6 +34,7 @@ var ( // metrics.RatelimitWindows.Set(float64(activeWindows)) 
RatelimitWindows = promauto.NewGauge( prometheus.GaugeOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "windows", Help: "Current number of rate-limit windows.", @@ -47,6 +49,7 @@ var ( // metrics.RatelimitBucketsCreated.Inc() RatelimitBucketsCreated = promauto.NewCounter( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "buckets_created_total", Help: "Total number of rate-limit buckets created.", @@ -61,6 +64,7 @@ var ( // metrics.RatelimitBucketsEvicted.Inc() RatelimitBucketsEvicted = promauto.NewCounter( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "buckets_evicted_total", Help: "Total number of rate-limit buckets evicted.", @@ -75,6 +79,7 @@ var ( // metrics.RatelimitWindowsCreated.Inc() RatelimitWindowsCreated = promauto.NewCounter( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "windows_created_total", Help: "Total number of rate-limit time windows created.", @@ -89,6 +94,7 @@ var ( // metrics.RatelimitWindowsEvicted.Inc() RatelimitWindowsEvicted = promauto.NewCounter( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "windows_evicted_total", Help: "Total number of rate-limit time windows evicted.", @@ -105,6 +111,7 @@ var ( // metrics.RatelimitDecisions.WithLabelValues("origin", "denied").Inc() RatelimitDecision = promauto.NewCounterVec( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "decisions_total", Help: "Total number of rate-limit decisions.", @@ -120,6 +127,7 @@ var ( // metrics.RatelimitRefreshFromOrigin.Inc() RatelimitRefreshFromOrigin = promauto.NewCounter( prometheus.CounterOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: "refresh_from_origin_total", Help: "Total number of refreshes from an origin.", @@ -137,6 +145,7 @@ var ( // defer timer.ObserveDuration() RatelimitOriginSyncLatency = promauto.NewHistogram( prometheus.HistogramOpts{ + Namespace: "unkey", Subsystem: "ratelimit", Name: 
"origin_sync_latency_seconds", Help: "Histogram of origin sync latencies in seconds.", @@ -144,4 +153,19 @@ var ( ConstLabels: constLabels, }, ) + + // RatelimitRefreshFromOriginErrorsTotal tracks the total number of errors when refreshing + // rate-limits from an origin. Use this counter to monitor origin sync reliability. + // + // Example usage: + // metrics.RatelimitRefreshFromOriginErrorsTotal.Inc() + RatelimitRefreshFromOriginErrorsTotal = promauto.NewCounter( + prometheus.CounterOpts{ + Namespace: "unkey", + Subsystem: "ratelimit", + Name: "refresh_from_origin_errors_total", + Help: "Total number of errors when refreshing from an origin.", + ConstLabels: constLabels, + }, + ) )