-
Notifications
You must be signed in to change notification settings - Fork 233
fix: improve prometheus schema usage collection and implement sampling #2323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1596e6f
76ee54a
ba0ab5c
4837cbd
423dac8
0cba4f0
807ddbd
b3583e7
e138c48
bc0e073
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -2,6 +2,7 @@ package core | |||||||||
|
|
||||||||||
| import ( | ||||||||||
| "context" | ||||||||||
| "math/rand/v2" | ||||||||||
| "slices" | ||||||||||
| "time" | ||||||||||
|
|
||||||||||
|
|
@@ -38,8 +39,14 @@ type OperationMetrics struct { | |||||||||
| logger *zap.Logger | ||||||||||
| trackUsageInfo bool | ||||||||||
|
|
||||||||||
| promSchemaUsageEnabled bool | ||||||||||
| promSchemaUsageIncludeOperationSha bool | ||||||||||
| promSchemaUsageEnabled bool | ||||||||||
| promSchemaUsageIncludeOpSha bool | ||||||||||
| promSchemaUsageSampleRate float64 | ||||||||||
| } | ||||||||||
|
|
||||||||||
| type usageKey struct { | ||||||||||
| fieldName string | ||||||||||
| parentType string | ||||||||||
| } | ||||||||||
|
|
||||||||||
| func (m *OperationMetrics) Finish(reqContext *requestContext, statusCode int, responseSize int, exportSynchronous bool) { | ||||||||||
|
|
@@ -82,28 +89,46 @@ func (m *OperationMetrics) Finish(reqContext *requestContext, statusCode int, re | |||||||||
| } | ||||||||||
|
|
||||||||||
| // Prometheus usage metrics, disabled by default | ||||||||||
| if m.promSchemaUsageEnabled && reqContext.operation != nil && !reqContext.operation.executionOptions.SkipLoader { | ||||||||||
| if m.promSchemaUsageEnabled && reqContext.operation != nil { | ||||||||||
|
|
||||||||||
| if !m.shouldSampleOperation() { | ||||||||||
| return | ||||||||||
| } | ||||||||||
|
|
||||||||||
| opAttrs := []attribute.KeyValue{ | ||||||||||
| rotel.WgOperationName.String(reqContext.operation.name), | ||||||||||
| rotel.WgOperationType.String(reqContext.operation.opType), | ||||||||||
| } | ||||||||||
|
|
||||||||||
| if m.promSchemaUsageIncludeOperationSha && reqContext.operation.sha256Hash != "" { | ||||||||||
| // Include operation SHA256 if enabled | ||||||||||
| if m.promSchemaUsageIncludeOpSha && reqContext.operation.sha256Hash != "" { | ||||||||||
| opAttrs = append(opAttrs, rotel.WgOperationSha256.String(reqContext.operation.sha256Hash)) | ||||||||||
| } | ||||||||||
|
|
||||||||||
| usageCounts := make(map[usageKey]int) | ||||||||||
|
|
||||||||||
| for _, field := range reqContext.operation.typeFieldUsageInfo { | ||||||||||
| if field.ExactParentTypeName == "" { | ||||||||||
| if field.ExactParentTypeName == "" || len(field.Path) == 0 { | ||||||||||
| continue | ||||||||||
| } | ||||||||||
|
|
||||||||||
| key := usageKey{ | ||||||||||
| fieldName: field.Path[len(field.Path)-1], | ||||||||||
| parentType: field.ExactParentTypeName, | ||||||||||
| } | ||||||||||
|
|
||||||||||
| usageCounts[key]++ | ||||||||||
| } | ||||||||||
|
|
||||||||||
| for key, count := range usageCounts { | ||||||||||
| fieldAttrs := []attribute.KeyValue{ | ||||||||||
| rotel.WgGraphQLFieldName.String(field.Path[len(field.Path)-1]), | ||||||||||
| rotel.WgGraphQLParentType.String(field.ExactParentTypeName), | ||||||||||
| rotel.WgGraphQLFieldName.String(key.fieldName), | ||||||||||
| rotel.WgGraphQLParentType.String(key.parentType), | ||||||||||
| } | ||||||||||
|
|
||||||||||
| rm.MeasureSchemaFieldUsage(ctx, 1, []attribute.KeyValue{}, otelmetric.WithAttributeSet(attribute.NewSet(slices.Concat(opAttrs, fieldAttrs)...))) | ||||||||||
| rm.MeasureSchemaFieldUsage(ctx, int64(count), []attribute.KeyValue{}, otelmetric.WithAttributeSet(attribute.NewSet(slices.Concat(opAttrs, fieldAttrs)...))) | ||||||||||
|
StarpTech marked this conversation as resolved.
|
||||||||||
| } | ||||||||||
|
|
||||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
|
|
@@ -116,8 +141,9 @@ type OperationMetricsOptions struct { | |||||||||
| Logger *zap.Logger | ||||||||||
| TrackUsageInfo bool | ||||||||||
|
|
||||||||||
| PrometheusSchemaUsageEnabled bool | ||||||||||
| PrometheusSchemaUsageIncludeSha bool | ||||||||||
| PrometheusSchemaUsageEnabled bool | ||||||||||
| PrometheusSchemaUsageIncludeOpSha bool | ||||||||||
| PrometheusSchemaUsageSampleRate float64 | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // newOperationMetrics creates a new OperationMetrics struct and starts the operation metrics. | ||||||||||
|
|
@@ -135,7 +161,30 @@ func newOperationMetrics(opts OperationMetricsOptions) *OperationMetrics { | |||||||||
| logger: opts.Logger, | ||||||||||
| trackUsageInfo: opts.TrackUsageInfo, | ||||||||||
|
|
||||||||||
| promSchemaUsageEnabled: opts.PrometheusSchemaUsageEnabled, | ||||||||||
| promSchemaUsageIncludeOperationSha: opts.PrometheusSchemaUsageIncludeSha, | ||||||||||
| promSchemaUsageEnabled: opts.PrometheusSchemaUsageEnabled, | ||||||||||
| promSchemaUsageIncludeOpSha: opts.PrometheusSchemaUsageIncludeOpSha, | ||||||||||
| promSchemaUsageSampleRate: opts.PrometheusSchemaUsageSampleRate, | ||||||||||
| } | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // shouldSampleOperation determines if a request should be sampled for schema field usage metrics. | ||||||||||
| // Uses probabilistic random sampling to ensure uniform distribution across all operations. | ||||||||||
| // | ||||||||||
| // This ensures: | ||||||||||
| // - All operations get statistical coverage (each request is sampled with probability equal to the sample rate) | ||||||||||
| // - Uniform distribution regardless of request ID format | ||||||||||
| // - Supports ANY sample rate (0.0 to 1.0), including arbitrary values like 0.8, 0.156, etc. | ||||||||||
| // | ||||||||||
| // Note: Uses random (non-deterministic) sampling instead of deterministic hash-based sampling, because | ||||||||||
| // sequential request IDs produce clustered hash values that would skew a hash-based sample. | ||||||||||
|
Comment on lines
+178
to
+179
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: This statement is somewhat contradictory. "Uses non-deterministic" conflicts with "clustered hash values that break deterministic sampling".
Suggested change
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Haven't seen this. Thanks for the suggestions. |
||||||||||
| func (m *OperationMetrics) shouldSampleOperation() bool { | ||||||||||
| if m.promSchemaUsageSampleRate >= 1.0 { | ||||||||||
| return true | ||||||||||
| } | ||||||||||
| if m.promSchemaUsageSampleRate <= 0.0 { | ||||||||||
| return false | ||||||||||
| } | ||||||||||
|
|
||||||||||
| // Probabilistic sampling: simple, reliable, and guaranteed uniform distribution | ||||||||||
| return rand.Float64() < m.promSchemaUsageSampleRate | ||||||||||
| } | ||||||||||
Uh oh!
There was an error while loading. Please reload this page.