diff --git a/router-tests/prometheus_improved_test.go b/router-tests/prometheus_improved_test.go index ad778de323..cbd3fad8ff 100644 --- a/router-tests/prometheus_improved_test.go +++ b/router-tests/prometheus_improved_test.go @@ -1,10 +1,11 @@ package integration import ( - rmetric "github.com/wundergraph/cosmo/router/pkg/metric" "regexp" "testing" + rmetric "github.com/wundergraph/cosmo/router/pkg/metric" + "github.com/prometheus/client_golang/prometheus" io_prometheus_client "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" @@ -36,7 +37,8 @@ func TestPrometheusSchemaUsage(t *testing.T) { PrometheusRegistry: promRegistry, MetricOptions: testenv.MetricOptions{ PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ - Enabled: true, + Enabled: true, + SampleRate: 1.0, }, }, }, func(t *testing.T, xEnv *testenv.Environment) { @@ -128,7 +130,8 @@ query myQuery { PrometheusRegistry: promRegistry, MetricOptions: testenv.MetricOptions{ PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ - Enabled: true, + Enabled: true, + SampleRate: 1.0, }, }, }, func(t *testing.T, xEnv *testenv.Environment) { @@ -203,6 +206,7 @@ query myQuery { PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ Enabled: true, IncludeOperationSha: false, + SampleRate: 1.0, }, }, }, func(t *testing.T, xEnv *testenv.Environment) { @@ -240,6 +244,7 @@ query myQuery { PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ Enabled: true, IncludeOperationSha: true, + SampleRate: 1.0, }, }, }, func(t *testing.T, xEnv *testenv.Environment) { @@ -283,6 +288,7 @@ query myQuery { PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ Enabled: true, IncludeOperationSha: false, + SampleRate: 1.0, }, }, }, func(t *testing.T, xEnv *testenv.Environment) { @@ -308,6 +314,142 @@ query myQuery { } }) }) + + t.Run("sampling reduces tracked requests", func(t *testing.T) { + t.Parallel() + + metricReader := metric.NewManualReader() + promRegistry := prometheus.NewRegistry() + + testenv.Run(t, &testenv.Config{ + MetricReader: metricReader, + PrometheusRegistry: promRegistry, + MetricOptions: testenv.MetricOptions{ + PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ + Enabled: true, + SampleRate: 0.1, // 10% sampling + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // Make 100 requests + for i := 0; i < 100; i++ { + res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{ + Query: `query myQuery { employee(id: 1) { id } }`, + }) + require.JSONEq(t, `{"data":{"employee":{"id":1}}}`, res.Body) + } + + mf, err := promRegistry.Gather() + require.NoError(t, err) + + schemaUsage := findMetricFamilyByName(mf, SchemaFieldUsageMetricName) + assert.NotNil(t, schemaUsage) + + schemaUsageMetrics := schemaUsage.GetMetric() + + require.Greater(t, len(schemaUsageMetrics), 0, "At least 1 request should be sampled") + + // With 10% sampling and 100 requests, each sampled request increments two field counters (`employee` and `id`). + // 100% sampling would produce 200 total field counts (100 requests * 2 fields), so a reduced total confirms sampling worked. + totalFieldCounts := 0.0 + for _, m := range schemaUsageMetrics { + counter := m.GetCounter() + require.NotNil(t, counter) + totalFieldCounts += counter.GetValue() + } + + require.Greater(t, totalFieldCounts, 0.0, "At least one sampled field is expected with a 10% sample rate") + require.Less(t, totalFieldCounts, 200.0, "Sampling should record fewer than 100% of requests (200 total field counts)") + + // Verify that the sampled metrics have correct structure + for _, m := range schemaUsageMetrics { + assertLabelValue(t, m.Label, otel.WgOperationName, "myQuery") + assertLabelValue(t, m.Label, otel.WgOperationType, "query") + } + }) + }) + + t.Run("100% sample rate tracks all requests", func(t *testing.T) { + t.Parallel() + + metricReader := metric.NewManualReader() + promRegistry := prometheus.NewRegistry() + + testenv.Run(t, &testenv.Config{ + MetricReader: metricReader, + PrometheusRegistry: promRegistry, + MetricOptions: testenv.MetricOptions{ + PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ + Enabled: true, + SampleRate: 1.0, // 100% sampling (default) + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // Make 10 requests + for range 10 { + res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{ + Query: `query myQuery { employee(id: 1) { id } }`, + }) + require.JSONEq(t, `{"data":{"employee":{"id":1}}}`, res.Body) + } + + mf, err := promRegistry.Gather() + require.NoError(t, err) + + schemaUsage := findMetricFamilyByName(mf, SchemaFieldUsageMetricName) + assert.NotNil(t, schemaUsage) + + schemaUsageMetrics := schemaUsage.GetMetric() + + // With 100% sampling and 10 requests, we expect 2 metrics (employee, id) + // The counter values should be 10 for each field + require.Len(t, schemaUsageMetrics, 2) + + for _, metric := range schemaUsageMetrics { + assertLabelValue(t, metric.Label, otel.WgOperationName, "myQuery") + assertLabelValue(t, metric.Label, otel.WgOperationType, "query") + + // Each field should have been counted 10 times (once per request) + assert.InEpsilon(t, 10.0, *metric.Counter.Value, 0.0001) + } + }) + }) + + t.Run("0% sample rate tracks no requests", func(t *testing.T) { + t.Parallel() + + metricReader := metric.NewManualReader() + promRegistry := prometheus.NewRegistry() + + testenv.Run(t, &testenv.Config{ + MetricReader: metricReader, + PrometheusRegistry: promRegistry, + MetricOptions: testenv.MetricOptions{ + PrometheusSchemaFieldUsage: testenv.PrometheusSchemaFieldUsage{ + Enabled: true, + SampleRate: 0.0, // 0% sampling + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // Make 10 requests + for range 10 { + res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{ + Query: `query myQuery { employee(id: 1) { id } }`, + }) + require.JSONEq(t, `{"data":{"employee":{"id":1}}}`, res.Body) + } + + mf, err := promRegistry.Gather() + require.NoError(t, err) + + schemaUsage := findMetricFamilyByName(mf, SchemaFieldUsageMetricName) + + // With 0% sampling, no metrics should be recorded + if schemaUsage != nil { + require.Len(t, schemaUsage.GetMetric(), 0, "No metrics should be recorded with 0% sampling") + } + }) + }) } func assertLabelNotPresent(t *testing.T, labels []*io_prometheus_client.LabelPair, labelKey attribute.Key) { diff --git a/router-tests/testenv/testenv.go b/router-tests/testenv/testenv.go index a7b2b01ef8..6ca8532958 100644 --- a/router-tests/testenv/testenv.go +++ b/router-tests/testenv/testenv.go @@ -279,6 +279,7 @@ type MetricOptions struct { type PrometheusSchemaFieldUsage struct { Enabled bool IncludeOperationSha bool + SampleRate float64 } type Config struct { @@ -1516,6 +1517,7 @@ func configureRouter(listenerAddr string, testConfig *Config, routerConfig *node PromSchemaFieldUsage: rmetric.PrometheusSchemaFieldUsage{ Enabled: testConfig.MetricOptions.PrometheusSchemaFieldUsage.Enabled, IncludeOperationSha: testConfig.MetricOptions.PrometheusSchemaFieldUsage.IncludeOperationSha, + SampleRate: testConfig.MetricOptions.PrometheusSchemaFieldUsage.SampleRate, }, } } diff --git a/router/core/graph_server.go b/router/core/graph_server.go index 4a16e38ad7..823a7629c3 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -919,8 +919,9 @@ func (s *graphServer) buildGraphMux( routerConfigVersion: opts.RouterConfigVersion, logger: s.logger, - promSchemaUsageEnabled: s.metricConfig.Prometheus.PromSchemaFieldUsage.Enabled, - promSchemaUsageIncludeOperationSha: s.metricConfig.Prometheus.PromSchemaFieldUsage.IncludeOperationSha, + promSchemaUsageEnabled: s.metricConfig.Prometheus.PromSchemaFieldUsage.Enabled, + promSchemaUsageIncludeOpSha: s.metricConfig.Prometheus.PromSchemaFieldUsage.IncludeOperationSha, + promSchemaUsageSampleRate: s.metricConfig.Prometheus.PromSchemaFieldUsage.SampleRate, }) baseLogFields := []zapcore.Field{ diff --git a/router/core/operation_metrics.go b/router/core/operation_metrics.go index 0f4b658873..d4615cf303 100644 --- a/router/core/operation_metrics.go +++ b/router/core/operation_metrics.go @@ -2,6 +2,7 @@ package core import ( "context" + "math/rand/v2" "slices" "time" @@ -38,8 +39,14 @@ type OperationMetrics struct { logger *zap.Logger trackUsageInfo bool - promSchemaUsageEnabled bool - promSchemaUsageIncludeOperationSha bool + promSchemaUsageEnabled bool + promSchemaUsageIncludeOpSha bool + promSchemaUsageSampleRate float64 +} + +type usageKey struct { + fieldName string + parentType string } func (m *OperationMetrics) Finish(reqContext *requestContext, statusCode int, responseSize int, exportSynchronous bool) { @@ -82,28 +89,46 @@ func (m *OperationMetrics) Finish(reqContext *requestContext, statusCode int, re } // Prometheus usage metrics, disabled by default - if m.promSchemaUsageEnabled && reqContext.operation != nil && !reqContext.operation.executionOptions.SkipLoader { + if m.promSchemaUsageEnabled && reqContext.operation != nil { + + if !m.shouldSampleOperation() { + return + } + opAttrs := []attribute.KeyValue{ rotel.WgOperationName.String(reqContext.operation.name), rotel.WgOperationType.String(reqContext.operation.opType), } - if m.promSchemaUsageIncludeOperationSha && reqContext.operation.sha256Hash != "" { + // Include operation SHA256 if enabled + if m.promSchemaUsageIncludeOpSha && reqContext.operation.sha256Hash != "" { opAttrs = append(opAttrs, rotel.WgOperationSha256.String(reqContext.operation.sha256Hash)) } + usageCounts := make(map[usageKey]int) + for _, field := range reqContext.operation.typeFieldUsageInfo { - if field.ExactParentTypeName == "" { + if field.ExactParentTypeName == "" || len(field.Path) == 0 { continue } + key := usageKey{ + fieldName: field.Path[len(field.Path)-1], + parentType: field.ExactParentTypeName, + } + + usageCounts[key]++ + } + + for key, count := range usageCounts { fieldAttrs := []attribute.KeyValue{ - rotel.WgGraphQLFieldName.String(field.Path[len(field.Path)-1]), - rotel.WgGraphQLParentType.String(field.ExactParentTypeName), + rotel.WgGraphQLFieldName.String(key.fieldName), + rotel.WgGraphQLParentType.String(key.parentType), } - rm.MeasureSchemaFieldUsage(ctx, 1, []attribute.KeyValue{}, otelmetric.WithAttributeSet(attribute.NewSet(slices.Concat(opAttrs, fieldAttrs)...))) + rm.MeasureSchemaFieldUsage(ctx, int64(count), []attribute.KeyValue{}, otelmetric.WithAttributeSet(attribute.NewSet(slices.Concat(opAttrs, fieldAttrs)...))) } + } } @@ -116,8 +141,9 @@ type OperationMetricsOptions struct { Logger *zap.Logger TrackUsageInfo bool - PrometheusSchemaUsageEnabled bool - PrometheusSchemaUsageIncludeSha bool + PrometheusSchemaUsageEnabled bool + PrometheusSchemaUsageIncludeOpSha bool + PrometheusSchemaUsageSampleRate float64 } // newOperationMetrics creates a new OperationMetrics struct and starts the operation metrics. @@ -135,7 +161,30 @@ func newOperationMetrics(opts OperationMetricsOptions) *OperationMetrics { logger: opts.Logger, trackUsageInfo: opts.TrackUsageInfo, - promSchemaUsageEnabled: opts.PrometheusSchemaUsageEnabled, - promSchemaUsageIncludeOperationSha: opts.PrometheusSchemaUsageIncludeSha, + promSchemaUsageEnabled: opts.PrometheusSchemaUsageEnabled, + promSchemaUsageIncludeOpSha: opts.PrometheusSchemaUsageIncludeOpSha, + promSchemaUsageSampleRate: opts.PrometheusSchemaUsageSampleRate, + } +} + +// shouldSampleOperation determines if a request should be sampled for schema field usage metrics. +// Uses probabilistic random sampling to ensure uniform distribution across all operations. +// +// This ensures: +// - All operations get statistical coverage (~X% of requests per operation) +// - Uniform distribution regardless of request ID format +// - Supports ANY sample rate (0.0 to 1.0), including arbitrary values like 0.8, 0.156, etc. +// +// Note: Uses non-deterministic random sampling rather than hash-based sampling because +// sequential request IDs produce clustered hash values that break deterministic sampling. +func (m *OperationMetrics) shouldSampleOperation() bool { + if m.promSchemaUsageSampleRate >= 1.0 { + return true } + if m.promSchemaUsageSampleRate <= 0.0 { + return false + } + + // Probabilistic sampling: simple, reliable, and guaranteed uniform distribution + return rand.Float64() < m.promSchemaUsageSampleRate } diff --git a/router/core/router.go b/router/core/router.go index fd07b166cc..94d99a10b9 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -2312,6 +2312,7 @@ func MetricConfigFromTelemetry(cfg *config.Telemetry) *rmetric.Config { PromSchemaFieldUsage: rmetric.PrometheusSchemaFieldUsage{ Enabled: cfg.Metrics.Prometheus.SchemaFieldUsage.Enabled, IncludeOperationSha: cfg.Metrics.Prometheus.SchemaFieldUsage.IncludeOperationSha, + SampleRate: cfg.Metrics.Prometheus.SchemaFieldUsage.SampleRate, }, }, } diff --git a/router/core/router_metrics.go b/router/core/router_metrics.go index 6e2d279471..4021eea2a7 100644 --- a/router/core/router_metrics.go +++ b/router/core/router_metrics.go @@ -27,8 +27,9 @@ type routerMetrics struct { logger *zap.Logger exportEnabled bool - promSchemaUsageEnabled bool - promSchemaUsageIncludeOperationSha bool + promSchemaUsageEnabled bool + promSchemaUsageIncludeOpSha bool + promSchemaUsageSampleRate float64 } type routerMetricsConfig struct { @@ -38,8 +39,9 @@ type routerMetricsConfig struct { logger *zap.Logger exportEnabled bool - promSchemaUsageEnabled bool - promSchemaUsageIncludeOperationSha bool + promSchemaUsageEnabled bool + promSchemaUsageIncludeOpSha bool + promSchemaUsageSampleRate float64 } func NewRouterMetrics(cfg *routerMetricsConfig) RouterMetrics { @@ -50,8 +52,9 @@ func NewRouterMetrics(cfg *routerMetricsConfig) RouterMetrics { logger: cfg.logger, exportEnabled: cfg.exportEnabled, - promSchemaUsageEnabled: cfg.promSchemaUsageEnabled, - promSchemaUsageIncludeOperationSha: cfg.promSchemaUsageIncludeOperationSha, + promSchemaUsageEnabled: cfg.promSchemaUsageEnabled, + promSchemaUsageIncludeOpSha: cfg.promSchemaUsageIncludeOpSha, + promSchemaUsageSampleRate: cfg.promSchemaUsageSampleRate, } } @@ -68,8 +71,9 @@ func (m *routerMetrics) StartOperation(logger *zap.Logger, requestContentLength InFlightAddOption: inFlightAddOption, SliceAttributes: sliceAttr, - PrometheusSchemaUsageEnabled: m.promSchemaUsageEnabled, - PrometheusSchemaUsageIncludeSha: m.promSchemaUsageIncludeOperationSha, + PrometheusSchemaUsageEnabled: m.promSchemaUsageEnabled, + PrometheusSchemaUsageIncludeOpSha: m.promSchemaUsageIncludeOpSha, + PrometheusSchemaUsageSampleRate: m.promSchemaUsageSampleRate, }) return metrics } diff --git a/router/pkg/config/config.go b/router/pkg/config/config.go index caf7738266..3900c0d904 100644 --- a/router/pkg/config/config.go +++ b/router/pkg/config/config.go @@ -113,8 +113,9 @@ type Prometheus struct { } type PrometheusSchemaFieldUsage struct { - Enabled bool `yaml:"enabled" envDefault:"false" env:"ENABLED"` - IncludeOperationSha bool `yaml:"include_operation_sha" envDefault:"false" env:"INCLUDE_OPERATION_SHA"` + Enabled bool `yaml:"enabled" envDefault:"false" env:"ENABLED"` + IncludeOperationSha bool `yaml:"include_operation_sha" envDefault:"false" env:"INCLUDE_OPERATION_SHA"` + SampleRate float64 `yaml:"sample_rate" envDefault:"1.0" env:"SAMPLE_RATE"` } type MetricsOTLPExporter struct { diff --git a/router/pkg/config/config.schema.json b/router/pkg/config/config.schema.json index 7874902f96..09f125170d 100644 --- a/router/pkg/config/config.schema.json +++ b/router/pkg/config/config.schema.json @@ -1245,6 +1245,13 @@ "type": "boolean", "default": false, "description": "Include the operation SHA256 in the metric labels, this can be an expensive operation. The default value is false." + }, + "sample_rate": { + "type": "number", + "default": 1.0, + "minimum": 0.0, + "maximum": 1.0, + "description": "Sample rate for schema field usage metrics (0.0 to 1.0). Uses probabilistic random sampling, ensuring all operations get ~X% statistical coverage with uniform distribution. Only recommended for large scale deployments. Default is 1.0 (100% sampling)." } } } diff --git a/router/pkg/config/config_test.go b/router/pkg/config/config_test.go index adb1e08c81..079a2b1d79 100644 --- a/router/pkg/config/config_test.go +++ b/router/pkg/config/config_test.go @@ -660,12 +660,14 @@ telemetry: schema_usage: enabled: true include_operation_sha: true + sample_rate: 0.5 # Supports any rate: 1.0, 0.8, 0.5, 0.1, 0.01, etc. `) c, err := LoadConfig([]string{f}) require.NoError(t, err) require.True(t, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.Enabled) require.True(t, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.IncludeOperationSha) + require.Equal(t, 0.5, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.SampleRate) }) t.Run("from environment", func(t *testing.T) { @@ -677,16 +679,16 @@ version: "1" require.NoError(t, err) require.False(t, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.Enabled) - require.False(t, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.IncludeOperationSha) + require.Equal(t, 1.0, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.SampleRate) t.Setenv("PROMETHEUS_SCHEMA_FIELD_USAGE_ENABLED", "true") - t.Setenv("PROMETHEUS_SCHEMA_FIELD_USAGE_INCLUDE_OPERATION_SHA", "true") + t.Setenv("PROMETHEUS_SCHEMA_FIELD_USAGE_SAMPLE_RATE", "0.25") c, err = LoadConfig([]string{f}) require.NoError(t, err) require.True(t, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.Enabled) - require.True(t, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.IncludeOperationSha) + require.Equal(t, 0.25, c.Config.Telemetry.Metrics.Prometheus.SchemaFieldUsage.SampleRate) }) } diff --git a/router/pkg/config/fixtures/full.yaml b/router/pkg/config/fixtures/full.yaml index ba24a18dbe..0513894381 100644 --- a/router/pkg/config/fixtures/full.yaml +++ b/router/pkg/config/fixtures/full.yaml @@ -176,7 +176,8 @@ telemetry: schema_usage: enabled: true - include_operation_sha: false + include_operation_sha: true + sample_rate: 1.0 # Supports any rate: 1.0, 0.8, 0.5, 0.1, 0.01, etc. cache_control_policy: enabled: true diff --git a/router/pkg/config/testdata/config_defaults.json b/router/pkg/config/testdata/config_defaults.json index 4a965af451..d2691d2391 100644 --- a/router/pkg/config/testdata/config_defaults.json +++ b/router/pkg/config/testdata/config_defaults.json @@ -62,7 +62,8 @@ "ExcludeScopeInfo": false, "SchemaFieldUsage": { "Enabled": false, - "IncludeOperationSha": false + "IncludeOperationSha": false, + "SampleRate": 1 } }, "CardinalityLimit": 2000 diff --git a/router/pkg/config/testdata/config_full.json b/router/pkg/config/testdata/config_full.json index f74e741d01..f1869a779b 100644 --- a/router/pkg/config/testdata/config_full.json +++ b/router/pkg/config/testdata/config_full.json @@ -92,7 +92,8 @@ "ExcludeScopeInfo": true, "SchemaFieldUsage": { "Enabled": true, - "IncludeOperationSha": false + "IncludeOperationSha": true, + "SampleRate": 1 } }, "CardinalityLimit": 2000 diff --git a/router/pkg/metric/config.go b/router/pkg/metric/config.go index 7a2c15e620..46da130889 100644 --- a/router/pkg/metric/config.go +++ b/router/pkg/metric/config.go @@ -41,6 +41,10 @@ type PrometheusConfig struct { type PrometheusSchemaFieldUsage struct { Enabled bool IncludeOperationSha bool + // SampleRate controls the percentage of requests to sample for schema field usage metrics (0.0 to 1.0). + // Uses probabilistic random sampling to ensure all operations get ~X% statistical coverage. + // Supports any rate: 1.0 (100%), 0.8 (80%), 0.5 (50%), 0.1 (10%), 0.01 (1%), etc. + SampleRate float64 } type OpenTelemetryExporter struct {