Skip to content
Closed
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
24ec69c
refactor(router): improve latency metrics handling in engine loader h…
alepane21 Feb 20, 2026
befb1cb
Merge remote-tracking branch 'origin/main' into ale/eng-8915-router-r…
alepane21 Feb 23, 2026
38d3d54
fix: ignore different response order
alepane21 Feb 23, 2026
4e050ea
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Feb 23, 2026
1dca661
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Feb 23, 2026
caea193
chore: improve readibility
alepane21 Feb 24, 2026
f0cc218
Merge remote-tracking branch 'origin/ale/eng-8915-router-router_http_…
alepane21 Feb 24, 2026
8f4baa8
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Feb 24, 2026
3928ae1
chore: replace Eq To Jsoneq to avoid flakiness
alepane21 Feb 24, 2026
6a30f1f
Merge remote-tracking branch 'origin/main' into ale/eng-8915-router-r…
alepane21 Feb 26, 2026
f162013
Merge remote-tracking branch 'origin/main' into ale/eng-8915-router-r…
alepane21 Feb 26, 2026
08efda8
fix: make test less flaky
alepane21 Feb 26, 2026
6823e5e
Merge remote-tracking branch 'origin/main' into ale/eng-8915-router-r…
alepane21 Mar 2, 2026
fc2df50
chore: undo changes not needed anymore
alepane21 Mar 2, 2026
3c901ce
chore: remove start var calculation
alepane21 Mar 2, 2026
3cd8de9
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Mar 2, 2026
dc7fa25
fix: remove unused field
alepane21 Mar 2, 2026
c59e8d1
Merge remote-tracking branch 'origin/ale/eng-8915-router-router_http_…
alepane21 Mar 2, 2026
3965823
chore: remove unused context type
alepane21 Mar 2, 2026
317cb0b
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Mar 3, 2026
d4266d8
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Mar 3, 2026
c413eb3
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Mar 12, 2026
d4f717d
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Mar 12, 2026
b874625
Merge branch 'main' into ale/eng-8915-router-router_http_request_dura…
alepane21 Apr 3, 2026
60ab2ba
feat: use latency coming up from engine
alepane21 Apr 3, 2026
3fa19c4
chore: remove local engine
alepane21 Apr 3, 2026
f185aab
fix: remove wrong file
alepane21 Apr 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions router-tests/prometheus_parallel_subgraph_metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package integration

import (
"testing"
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/require"
"github.com/wundergraph/cosmo/router-tests/testenv"
"go.opentelemetry.io/otel/sdk/metric"
)

func TestPrometheusParallelSubgraphRequestDurationMetrics(t *testing.T) {
t.Parallel()

const productsDelay = 1200 * time.Millisecond

metricReader := metric.NewManualReader()
promRegistry := prometheus.NewRegistry()

testenv.Run(t, &testenv.Config{
MetricReader: metricReader,
PrometheusRegistry: promRegistry,
Subgraphs: testenv.SubgraphsConfig{
Products: testenv.SubgraphConfig{
Delay: productsDelay,
},
},
}, func(t *testing.T, xEnv *testenv.Environment) {
res := xEnv.MakeGraphQLRequestOK(testenv.GraphQLRequest{
Query: `query TestParallel2 {
employee(id: 1) {
id
}
productTypes {
... on Consultancy {
name
}
... on Cosmo {
name
}
}
}`,
})

require.Contains(t, res.Body, `"employee":{"id":1}`)
require.Contains(t, res.Body, `"productTypes"`)

metricFamily, err := promRegistry.Gather()
require.NoError(t, err)

requestDuration := findMetricFamilyByName(metricFamily, "router_http_request_duration_milliseconds")
require.NotNil(t, requestDuration)

employeesDurationMetrics := findMetricsByLabel(requestDuration, "wg_subgraph_name", "employees")
require.Len(t, employeesDurationMetrics, 1)

productsDurationMetrics := findMetricsByLabel(requestDuration, "wg_subgraph_name", "products")
require.Len(t, productsDurationMetrics, 1)

employeesHistogram := employeesDurationMetrics[0].GetHistogram()
productsHistogram := productsDurationMetrics[0].GetHistogram()
require.NotNil(t, employeesHistogram)
require.NotNil(t, productsHistogram)
require.EqualValues(t, 1, employeesHistogram.GetSampleCount())
require.EqualValues(t, 1, productsHistogram.GetSampleCount())

employeesDurationMs := employeesHistogram.GetSampleSum()
productsDurationMs := productsHistogram.GetSampleSum()

require.GreaterOrEqual(t, productsDurationMs, float64(productsDelay.Milliseconds()))
require.Less(t, employeesDurationMs, float64(productsDelay.Milliseconds()))
})
}
27 changes: 7 additions & 20 deletions router/core/engine_loader_hooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,6 @@ type engineLoaderHooks struct {
headerPropagation *HeaderPropagation
}

type engineLoaderHooksRequestContext struct {
startTime time.Time
}

func NewEngineRequestHooks(
metricStore metric.Store,
logger *requestlogger.SubgraphAccessLogger,
Expand Down Expand Up @@ -97,8 +93,6 @@ func (f *engineLoaderHooks) OnLoad(ctx context.Context, ds resolve.DataSourceInf
return ctx
}

start := time.Now()

ctx = context.WithValue(ctx, rcontext.CurrentSubgraphContextKey{}, ds.Name)

duration := atomic.Int64{}
Expand All @@ -116,9 +110,7 @@ func (f *engineLoaderHooks) OnLoad(ctx context.Context, ds resolve.DataSourceInf
}...),
)

return context.WithValue(ctx, rcontext.EngineLoaderHooksContextKey, &engineLoaderHooksRequestContext{
startTime: start,
})
return ctx
}

func (f *engineLoaderHooks) OnFinished(ctx context.Context, ds resolve.DataSourceInfo, responseInfo *resolve.ResponseInfo) {
Expand Down Expand Up @@ -148,13 +140,6 @@ func (f *engineLoaderHooks) OnFinished(ctx context.Context, ds resolve.DataSourc
return
}

hookCtx, ok := ctx.Value(rcontext.EngineLoaderHooksContextKey).(*engineLoaderHooksRequestContext)
if !ok {
return
}

latency := time.Since(hookCtx.startTime)

span := trace.SpanFromContext(ctx)
defer span.End()

Expand All @@ -175,9 +160,11 @@ func (f *engineLoaderHooks) OnFinished(ctx context.Context, ds resolve.DataSourc
exprCtx.Subgraph.Name = ds.Name
exprCtx.Subgraph.Request.Error = WrapExprError(responseInfo.Err)

var fetchLatency time.Duration
if value := ctx.Value(rcontext.FetchTimingKey); value != nil {
if fetchTiming, ok := value.(*atomic.Int64); ok {
exprCtx.Subgraph.Request.ClientTrace.FetchDuration = time.Duration(fetchTiming.Load())
fetchLatency = time.Duration(fetchTiming.Load())
exprCtx.Subgraph.Request.ClientTrace.FetchDuration = fetchLatency

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This measures the low level round trip, whereas the current measurement should measure this plus the engine hook logic that would add to the request duration.

However more importantly, would it work with GRPC data sources?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch! It would not work with gRPC data sources.

}
}

Expand Down Expand Up @@ -228,7 +215,7 @@ func (f *engineLoaderHooks) OnFinished(ctx context.Context, ds resolve.DataSourc
zap.String("subgraph_name", ds.Name),
zap.String("subgraph_id", ds.ID),
zap.Int("status", responseInfo.StatusCode),
zap.Duration("latency", latency),
zap.Duration("latency", fetchLatency),
}
path := ds.Name
if responseInfo.Request != nil {
Expand Down Expand Up @@ -299,10 +286,10 @@ func (f *engineLoaderHooks) OnFinished(ctx context.Context, ds resolve.DataSourc

attrOpt := otelmetric.WithAttributeSet(attribute.NewSet(metricAttrs...))
f.metricStore.MeasureRequestCount(ctx, metricSliceAttrs, attrOpt)
f.metricStore.MeasureLatency(ctx, latency, metricSliceAttrs, attrOpt)
f.metricStore.MeasureLatency(ctx, fetchLatency, metricSliceAttrs, attrOpt)
} else {
f.metricStore.MeasureRequestCount(ctx, reqContext.telemetry.metricSliceAttrs, metricAddOpt)
f.metricStore.MeasureLatency(ctx, latency, reqContext.telemetry.metricSliceAttrs, metricAddOpt)
f.metricStore.MeasureLatency(ctx, fetchLatency, reqContext.telemetry.metricSliceAttrs, metricAddOpt)
}

span.SetAttributes(traceAttrs...)
Expand Down
Loading