From d86e5cced7f5f56f4e65fca95d4709f8905a03f2 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 18:23:41 +0530 Subject: [PATCH 01/24] feat: retry settings config per subgraph --- router-tests/error_handling_test.go | 24 +- router-tests/panic_test.go | 12 +- router-tests/retry_test.go | 343 +++++++++++++- router-tests/structured_logging_test.go | 10 +- router-tests/telemetry/telemetry_test.go | 28 +- router/core/graph_server.go | 32 +- router/core/retry_builder.go | 71 +-- router/core/retry_builder_test.go | 343 +++++++------- router/core/router.go | 73 ++- router/core/router_config.go | 5 +- router/core/supervisor_instance.go | 10 +- router/core/transport.go | 14 +- router/internal/expr/retry_expression.go | 48 +- router/internal/expr/retry_expression_test.go | 32 +- router/internal/retrytransport/manager.go | 144 ++++++ .../retrytransport/retry_transport.go | 60 ++- .../retrytransport/retry_transport_test.go | 444 ++++++++---------- 17 files changed, 1046 insertions(+), 647 deletions(-) create mode 100644 router/internal/retrytransport/manager.go diff --git a/router-tests/error_handling_test.go b/router-tests/error_handling_test.go index 394b712492..5a7f69febe 100644 --- a/router-tests/error_handling_test.go +++ b/router-tests/error_handling_test.go @@ -1380,7 +1380,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -1412,7 +1416,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + 
core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -1444,7 +1452,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -1476,7 +1488,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ diff --git a/router-tests/panic_test.go b/router-tests/panic_test.go index bb12614f44..eeca055597 100644 --- a/router-tests/panic_test.go +++ b/router-tests/panic_test.go @@ -48,7 +48,11 @@ func TestEnginePanic(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { res, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ 
@@ -80,7 +84,11 @@ func TestEnginePanic(t *testing.T) { EnableSingleFlight: true, ParseKitPoolSize: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { res, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index cf916e15bc..010b3ab1d8 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -1,15 +1,16 @@ package integration import ( - "github.com/stretchr/testify/require" - "github.com/wundergraph/cosmo/router-tests/testenv" - "github.com/wundergraph/cosmo/router/core" - "github.com/wundergraph/cosmo/router/pkg/config" "net/http" "strconv" "sync/atomic" "testing" "time" + + "github.com/stretchr/testify/require" + "github.com/wundergraph/cosmo/router-tests/testenv" + "github.com/wundergraph/cosmo/router/core" + "github.com/wundergraph/cosmo/router/pkg/config" ) func CreateRetryCounterFunc(counter *atomic.Int32, duration *atomic.Int64) func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { @@ -34,7 +35,19 @@ func TestRetry(t *testing.T) { maxRetryCount := 3 expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ 
-76,7 +89,19 @@ func TestRetry(t *testing.T) { maxRetryCount := 3 expression := "false" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -117,7 +142,19 @@ func TestRetry(t *testing.T) { maxRetryCount := 3 expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -159,7 +196,19 @@ func TestRetry(t *testing.T) { maxAttemptsBeforeServiceSucceeds := 2 expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := 
core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -208,7 +257,19 @@ func TestRetry(t *testing.T) { expression := "statusCode == 429" headerRetryIntervalInSeconds := 1 - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 2000*time.Second, 100*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 2000 * time.Second, + Interval: 100 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -260,7 +321,19 @@ func TestFlakyRetry(t *testing.T) { maxDuration := 100 * time.Millisecond expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, maxDuration, retryInterval, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: maxDuration, + Interval: retryInterval, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -315,7 +388,19 @@ func TestFlakyRetry(t *testing.T) { maxRetryCount := 3 expression := "statusCode == 429" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 1000*time.Millisecond, retryInterval, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 1000 * time.Millisecond, + Interval: 
retryInterval, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -363,7 +448,19 @@ func TestFlakyRetry(t *testing.T) { emptyRetryInterval := 0 retryInterval := 300 * time.Millisecond - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 1000*time.Millisecond, retryInterval, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 1000 * time.Millisecond, + Interval: retryInterval, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -399,3 +496,225 @@ func TestFlakyRetry(t *testing.T) { }) }) } + +func TestRetryPerSubgraph(t *testing.T) { + t.Parallel() + + t.Run("verify invalid algorithm is detected for base", func(t *testing.T) { + t.Parallel() + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "unsupported retry algorithm") + }) + + t.Run("verify invalid algorithm is detected for per subgraphs", func(t 
*testing.T) { + t.Parallel() + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "unsupported retry algorithm") + }) + + t.Run("verify retries are applied per subgraph", func(t *testing.T) { + t.Parallel() + + employeesCalls := atomic.Int32{} + test1Calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + employeesMax := 3 + test1Max := 1 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: employeesMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + "test1": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: test1Max, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w 
http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + employeesCalls.Add(1) + }) + }, + }, + Test1: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + test1Calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // 1) Call employees-only query; expect employees subgraph to be retried employeesMax times (attempts = retries + 1) + resEmp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query employees { employees { id } }`, + }) + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, int32(0), test1Calls.Load()) + + // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) + resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { floatField(arg: 1.5) }`, + }) + + require.NoError(t, err) + require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, int32(test1Max+1), test1Calls.Load()) + }) + }) + + t.Run("verify retries are applied per subgraph when mixed configurations", func(t *testing.T) { + t.Parallel() + + employeesCalls := atomic.Int32{} + test1Calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + employeesMax := 3 + test1Max := 1 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: employeesMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + 
Expression: "true", + }, + }, + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "test1": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: test1Max, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + employeesCalls.Add(1) + }) + }, + }, + Test1: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + test1Calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // 1) Call employees-only query; expect employees subgraph to be retried employeesMax times (attempts = retries + 1) + resEmp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query employees { employees { id } }`, + }) + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, int32(0), test1Calls.Load()) + + // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) + resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { floatField(arg: 1.5) }`, + }) + + require.NoError(t, err) + require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, 
int32(test1Max+1), test1Calls.Load()) + }) + }) +} diff --git a/router-tests/structured_logging_test.go b/router-tests/structured_logging_test.go index 7464fa26c9..b452c03c13 100644 --- a/router-tests/structured_logging_test.go +++ b/router-tests/structured_logging_test.go @@ -709,7 +709,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, @@ -828,7 +828,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, @@ -960,7 +960,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, @@ -1096,7 +1096,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: 
testenv.LogObservationConfig{ Enabled: true, @@ -2211,7 +2211,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, diff --git a/router-tests/telemetry/telemetry_test.go b/router-tests/telemetry/telemetry_test.go index 2b4987c0b7..205128a295 100644 --- a/router-tests/telemetry/telemetry_test.go +++ b/router-tests/telemetry/telemetry_test.go @@ -826,7 +826,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate operation cache telemetry for persisted and non persisted operations", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1204,7 +1203,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate operation cache telemetry when prometheus is also enabled", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1546,7 +1544,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate key and cost eviction metrics with small validation cache size without feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1890,7 +1887,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { 
metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate operation cache telemetry for default configuration including feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -3562,7 +3558,6 @@ func TestFlakyTelemetry(t *testing.T) { }, }, } - want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -4087,7 +4082,6 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, sn[1].Attributes(), otel.WgEnginePersistedOperationCacheHit.Bool(true)) }) }) - t.Run("Should exclude high cardinality attributes only from metrics if custom exporter is defined", func(t *testing.T) { t.Parallel() @@ -4735,7 +4729,6 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Len(t, rm.ScopeMetrics, defaultExposedScopedMetricsCount) - scopeMetric := *integration.GetMetricScopeByName(rm.ScopeMetrics, "cosmo.router") require.Len(t, scopeMetric.Metrics, defaultCosmoRouterMetricsCount) @@ -5106,7 +5099,6 @@ func TestFlakyTelemetry(t *testing.T) { }) }) - t.Run("Should remap metric name to configured value", func(t *testing.T) { t.Parallel() @@ -5447,7 +5439,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("Custom span and resource attributes are attached to all metrics and spans / from header", func(t *testing.T) { t.Parallel() @@ -5879,7 +5870,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("Custom span and resource attributes are attached to all metrics and spans / static", func(t *testing.T) { t.Parallel() @@ -6309,7 
+6299,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("Requesting a feature flags will emit different router config version and add the feature flag attribute", func(t *testing.T) { t.Parallel() @@ -6950,7 +6939,6 @@ func TestFlakyTelemetry(t *testing.T) { require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[8].Status()) }) }) - t.Run("Origin connectivity issue is traced", func(t *testing.T) { t.Parallel() @@ -7532,7 +7520,6 @@ func TestFlakyTelemetry(t *testing.T) { integration.AssertAttributeNotInSet(t, resClFiltered.DataPoints[1].Attributes, otel.WgOperationName.String("")) }) }) - t.Run("Custom Metric Attributes", func(t *testing.T) { t.Parallel() @@ -8104,7 +8091,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, want, scopeMetric, metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("should not remap high cardinality fields when using cloud exporter but include custom metric attributes", func(t *testing.T) { t.Parallel() @@ -8706,7 +8692,11 @@ func TestFlakyTelemetry(t *testing.T) { }, }, RouterOptions: []core.Option{ - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, Subgraphs: testenv.SubgraphsConfig{ Products: testenv.SubgraphConfig{ @@ -8742,7 +8732,6 @@ func TestFlakyTelemetry(t *testing.T) { require.True(t, found, "expected to find a datapoint with subgraph name and id in the metrics") }) }) - t.Run("Tracing is not affected by custom metric attributes", func(t *testing.T) { t.Parallel() @@ -9187,7 +9176,6 @@ func TestFlakyTelemetry(t *testing.T) { assert.ErrorAs(t, err, &expectedErr) }) }) - t.Run("custom trace metrics with expression", 
func(t *testing.T) { t.Parallel() @@ -9592,7 +9580,6 @@ func TestFlakyTelemetry(t *testing.T) { }) }) }) - t.Run("verify attribute expressions with subgraph in the expression", func(t *testing.T) { t.Run("verify subgraph expression should only be present for engine fetch", func(t *testing.T) { t.Parallel() @@ -9650,8 +9637,8 @@ func TestFlakyTelemetry(t *testing.T) { require.IsType(t, metricdata.Sum[int64]{}, httpRequestsMetric.Data) data2 := httpRequestsMetric.Data.(metricdata.Sum[int64]) - attrs := data2.DataPoints[0].Attributes - _, ok := attrs.Value(attribute.Key(key)) + atts := data2.DataPoints[0].Attributes + _, ok := atts.Value(attribute.Key(key)) require.False(t, ok) }) }) @@ -10204,7 +10191,6 @@ func TestExcludeAttributesWithCustomExporter(t *testing.T) { metricdatatest.AssertEqual(t, connectionMetrics, *integration.GetMetricByName(engineScope, "router.engine.connections"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run(fmt.Sprintf("cache metrics for %s", usingCustomExporter), func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() diff --git a/router/core/graph_server.go b/router/core/graph_server.go index 875b0969d5..6079066580 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -40,6 +40,7 @@ import ( rmiddleware "github.com/wundergraph/cosmo/router/internal/middleware" "github.com/wundergraph/cosmo/router/internal/recoveryhandler" "github.com/wundergraph/cosmo/router/internal/requestlogger" + "github.com/wundergraph/cosmo/router/internal/retrytransport" "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/cors" "github.com/wundergraph/cosmo/router/pkg/execution_config" @@ -100,6 +101,7 @@ type ( traceDialer *TraceDialer connector *grpcconnector.Connector circuitBreakerManager *circuit.Manager + retryManager *retrytransport.Manager } ) @@ -261,6 +263,28 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC s.circuitBreakerManager = manager } 
+ if s.retryOptions.IsEnabled() { + retryExprManager := expr.NewRetryExpressionManager() + // Build retry options and handle any expression compilation errors + shouldRetryFunc, err := BuildRetryFunction(retryExprManager) + if err != nil { + return nil, fmt.Errorf("failed to build retry function: %w", err) + } + + retryManager := retrytransport.NewManager(retryExprManager, shouldRetryFunc, s.retryOptions.OnRetryFunc) + + err = retryManager.Initialize( + s.retryOptions.All, + s.retryOptions.SubgraphMap, + routerConfig.GetSubgraphs(), + ) + if err != nil { + return nil, err + } + + s.retryManager = retryManager + } + routingUrlGroupings, err := getRoutingUrlGroupingForCircuitBreakers(routerConfig, s.overrideRoutingURLConfiguration, s.overrides) if err != nil { return nil, err @@ -1155,12 +1179,6 @@ func (s *graphServer) buildGraphMux( baseConnMetricStore = s.connectionMetrics } - // Build retry options and handle any expression compilation errors - processedRetryOptions, err := ProcessRetryOptions(s.retryOptions) - if err != nil { - return nil, fmt.Errorf("failed to process retry options: %w", err) - } - ecb := &ExecutorConfigurationBuilder{ introspection: s.introspection, baseURL: s.baseURL, @@ -1181,13 +1199,13 @@ func (s *graphServer) buildGraphMux( PostHandlers: s.postOriginHandlers, MetricStore: gm.metricStore, ConnectionMetricStore: baseConnMetricStore, - RetryOptions: *processedRetryOptions, TracerProvider: s.tracerProvider, TracePropagators: s.compositePropagator, LocalhostFallbackInsideDocker: s.localhostFallbackInsideDocker, Logger: s.logger, EnableTraceClient: enableTraceClient, CircuitBreaker: s.circuitBreakerManager, + RetryManager: s.retryManager, }, } diff --git a/router/core/retry_builder.go b/router/core/retry_builder.go index 30fffcb53f..e563b58d3a 100644 --- a/router/core/retry_builder.go +++ b/router/core/retry_builder.go @@ -1,7 +1,6 @@ package core import ( - "fmt" "net/http" "strings" @@ -10,74 +9,10 @@ import ( "go.uber.org/zap" ) -const ( - 
defaultRetryExpression = "IsRetryableStatusCode() || IsConnectionError() || IsTimeout()" - - backoffJitter = "backoff_jitter" -) - -var noopRetryFunc = func(err error, req *http.Request, resp *http.Response) bool { - return false -} - -func ProcessRetryOptions(retryOpts retrytransport.RetryOptions) (*retrytransport.RetryOptions, error) { - // Default to backOffJitter if no algorithm is specified - // This will occur either in tests or if the user explicitly makes it an empty string - if retryOpts.Algorithm == "" { - retryOpts.Algorithm = backoffJitter - } - - // We skip validating the algorithm if retries are disabled - if retryOpts.Enabled && retryOpts.Algorithm != backoffJitter { - return nil, fmt.Errorf("unsupported retry algorithm: %s", retryOpts.Algorithm) - } - - shouldRetryFunc, err := buildRetryFunction(retryOpts) - if err != nil { - return nil, fmt.Errorf("failed to build retry function: %w", err) - } - - // Create copy to not mutate the original reference - retryOptions := retrytransport.RetryOptions{ - Enabled: retryOpts.Enabled, - Algorithm: retryOpts.Algorithm, - MaxRetryCount: retryOpts.MaxRetryCount, - MaxDuration: retryOpts.MaxDuration, - Interval: retryOpts.Interval, - Expression: retryOpts.Expression, - - OnRetry: retryOpts.OnRetry, - - ShouldRetry: shouldRetryFunc, - } - - return &retryOptions, nil -} - // BuildRetryFunction creates a ShouldRetry function based on the provided expression -func buildRetryFunction(retryOpts retrytransport.RetryOptions) (retrytransport.ShouldRetryFunc, error) { - // We do not need to build a retry function if retries are disabled - // This means that any bad expressions are ignored if retries are disabled - if !retryOpts.Enabled { - return noopRetryFunc, nil - } - - // Use default expression if empty string is passed - expression := retryOpts.Expression - if expression == "" { - expression = defaultRetryExpression - } - - // Create the retry expression manager - manager, err := 
expr.NewRetryExpressionManager(expression) - if err != nil { - return nil, fmt.Errorf("failed to create expression manager: %w", err) - } - - // Return expression-based retry function - return func(err error, req *http.Request, resp *http.Response) bool { +func BuildRetryFunction(manager *expr.RetryExpressionManager) (retrytransport.ShouldRetryFunc, error) { + return func(err error, req *http.Request, resp *http.Response, expression string) bool { reqContext := getRequestContext(req.Context()) - if reqContext == nil { return false } @@ -95,7 +30,7 @@ func buildRetryFunction(retryOpts retrytransport.RetryOptions) (retrytransport.S ctx := expr.LoadRetryContext(err, resp) // Evaluate the expression - shouldRetry, evalErr := manager.ShouldRetry(ctx) + shouldRetry, evalErr := manager.ShouldRetry(ctx, expression) if evalErr != nil { reqContext.logger.Error("Failed to evaluate retry expression", zap.Error(evalErr), diff --git a/router/core/retry_builder_test.go b/router/core/retry_builder_test.go index d4fa1cf3f6..6a2d34cdab 100644 --- a/router/core/retry_builder_test.go +++ b/router/core/retry_builder_test.go @@ -2,15 +2,13 @@ package core import ( "errors" - "fmt" - "github.com/wundergraph/cosmo/router/internal/retrytransport" "io" "net/http" - "reflect" "syscall" "testing" "github.com/stretchr/testify/assert" + "github.com/wundergraph/cosmo/router/internal/expr" "go.uber.org/zap" ) @@ -52,23 +50,19 @@ func createRequestWithContext(opType string) (*http.Request, *requestContext) { func TestBuildRetryFunction(t *testing.T) { t.Run("build function when retry is disabled", func(t *testing.T) { - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: false, - Expression: "invalid expression ++++++", - }) + manager := expr.NewRetryExpressionManager() + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) - assert.Equal(t, - reflect.ValueOf(noopRetryFunc).Pointer(), - reflect.ValueOf(fn).Pointer(), - ) + assert.NotNil(t, fn) }) t.Run("default 
expression behavior", func(t *testing.T) { // Use the default expression that would be in the config - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: defaultRetryExpression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression("") + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -77,25 +71,26 @@ func TestBuildRetryFunction(t *testing.T) { // Test default behavior - should retry on 500 resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, "")) // Should not retry on 200 resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, "")) // Test with errors - only expression-defined errors are handled here - assert.True(t, fn(syscall.ETIMEDOUT, req, nil)) - assert.True(t, fn(errors.New("connection refused"), req, nil)) - assert.True(t, fn(errors.New("unexpected EOF"), req, nil)) // EOF is now handled at transport layer, not expression - assert.False(t, fn(errors.New("some other error"), req, nil)) + assert.True(t, fn(syscall.ETIMEDOUT, req, nil, "")) + assert.True(t, fn(errors.New("connection refused"), req, nil, "")) + assert.True(t, fn(errors.New("unexpected EOF"), req, nil, "")) // EOF is now handled at transport layer, not expression + assert.False(t, fn(errors.New("some other error"), req, nil, "")) }) t.Run("expression-based retry", func(t *testing.T) { expression := "statusCode == 500 || statusCode == 503" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -104,23 +99,24 @@ func TestBuildRetryFunction(t *testing.T) { // Should retry on 500 resp := &http.Response{StatusCode: 
500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Should retry on 503 resp.StatusCode = 503 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Should not retry on 502 resp.StatusCode = 502 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("expression with error conditions", func(t *testing.T) { expression := "IsTimeout() || statusCode == 503" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -129,33 +125,31 @@ func TestBuildRetryFunction(t *testing.T) { // Should retry on timeout error err = syscall.ETIMEDOUT - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, expression)) // Should retry on 503 resp := &http.Response{StatusCode: 503} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Should not retry on other errors err = errors.New("some other error") - assert.False(t, fn(err, req, nil)) + assert.False(t, fn(err, req, nil, expression)) }) t.Run("invalid expression returns error", func(t *testing.T) { expression := "invalid syntax +++" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) assert.Error(t, err) - assert.Nil(t, fn) assert.Contains(t, err.Error(), "failed to compile retry expression") }) t.Run("empty expression uses default", func(t *testing.T) { - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: "", - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression("") + assert.NoError(t, err) + + fn, err := 
BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -164,42 +158,44 @@ func TestBuildRetryFunction(t *testing.T) { // Test with retryable status code resp := &http.Response{StatusCode: 502} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, "")) // Test with connection error err = errors.New("connection refused") - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, "")) // Test with timeout error err = syscall.ETIMEDOUT - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, "")) // Test with non-retryable error err = errors.New("some other error") - assert.False(t, fn(err, req, nil)) + assert.False(t, fn(err, req, nil, "")) }) t.Run("expression that always returns false but the error is an eof error", func(t *testing.T) { expression := "false" // Don't retry - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) // Create a request with proper query context req, _ := createRequestWithContext(OperationTypeQuery) - assert.True(t, fn(io.ErrUnexpectedEOF, req, nil)) + assert.True(t, fn(io.ErrUnexpectedEOF, req, nil, expression)) }) t.Run("expression that always returns true", func(t *testing.T) { expression := "true" // Always retry - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -208,19 +204,20 @@ func TestBuildRetryFunction(t *testing.T) { resp := &http.Response{StatusCode: 500} // Should retry when expression is true - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, 
req, resp, expression)) // Even for status codes that wouldn't normally retry resp.StatusCode = 200 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) }) t.Run("complex expression", func(t *testing.T) { expression := "(statusCode >= 500 && statusCode < 600) || IsConnectionError()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -229,25 +226,26 @@ func TestBuildRetryFunction(t *testing.T) { // Test 5xx errors resp := &http.Response{StatusCode: 503} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test connection error err = errors.New("connection refused") - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, expression)) // Test non-matching conditions resp.StatusCode = 404 err = errors.New("some other error") - assert.False(t, fn(err, req, resp)) + assert.False(t, fn(err, req, resp, expression)) }) t.Run("mutation never retries with proper context", func(t *testing.T) { // Use expression that would normally retry on 500 errors expression := "statusCode >= 500 || IsTimeout() || IsConnectionError()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -256,29 +254,28 @@ func TestBuildRetryFunction(t *testing.T) { // Test with 500 status - should NOT retry because it's a mutation resp := &http.Response{StatusCode: 500} - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) // Test with timeout error - should NOT retry because 
it's a mutation - assert.False(t, fn(syscall.ETIMEDOUT, req, nil)) + assert.False(t, fn(syscall.ETIMEDOUT, req, nil, expression)) // Test with connection error - should NOT retry because it's a mutation - assert.False(t, fn(errors.New("connection refused"), req, nil)) + assert.False(t, fn(errors.New("connection refused"), req, nil, expression)) // Test with expression that always returns true - should still NOT retry - alwaysRetryFn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: "true", - }) + alwaysRetryExpression := "true" + err = manager.AddExpression(alwaysRetryExpression) assert.NoError(t, err) - assert.False(t, alwaysRetryFn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, alwaysRetryExpression)) }) t.Run("query retries with proper context", func(t *testing.T) { expression := "statusCode >= 500 || IsTimeout()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -287,22 +284,23 @@ func TestBuildRetryFunction(t *testing.T) { // Test with 500 status - should retry because it's a query resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test with timeout error - should retry because it's a query - assert.True(t, fn(syscall.ETIMEDOUT, req, nil)) + assert.True(t, fn(syscall.ETIMEDOUT, req, nil, expression)) // Test with 200 status - should not retry even for query resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("subscription retries with proper context", func(t *testing.T) { expression := "statusCode >= 500" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + 
manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -311,20 +309,21 @@ func TestBuildRetryFunction(t *testing.T) { // Test with 500 status - should retry because it's a subscription (not mutation) resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test with 200 status - should not retry resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("error logging with proper context", func(t *testing.T) { // Test that error logging works with proper request context expression := "statusCode >= 500" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -333,18 +332,19 @@ func TestBuildRetryFunction(t *testing.T) { // Test that it works normally with proper context resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("request context with query operation", func(t *testing.T) { expression := "statusCode >= 500" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -353,19 +353,20 @@ func TestBuildRetryFunction(t *testing.T) { // Should work with proper request context - expression should be 
evaluated normally resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("complex expression with mutation context", func(t *testing.T) { // Complex expression that would normally retry in many cases expression := "(statusCode >= 500 && statusCode < 600) || IsConnectionError() || IsTimeout() || statusCode == 429" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -374,25 +375,26 @@ func TestBuildRetryFunction(t *testing.T) { // Test various conditions that would normally trigger retry resp := &http.Response{StatusCode: 500} - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) resp.StatusCode = 503 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) resp.StatusCode = 429 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) - assert.False(t, fn(syscall.ETIMEDOUT, req, nil)) - assert.False(t, fn(errors.New("connection refused"), req, nil)) + assert.False(t, fn(syscall.ETIMEDOUT, req, nil, expression)) + assert.False(t, fn(errors.New("connection refused"), req, nil, expression)) }) t.Run("new operation with comprehensive retry conditions", func(t *testing.T) { // Create a new comprehensive operation to test all retry scenarios expression := "statusCode >= 500 || statusCode == 429 || IsTimeout() || IsConnectionError()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + 
assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -401,81 +403,82 @@ func TestBuildRetryFunction(t *testing.T) { // Test 5xx errors resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) resp.StatusCode = 503 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test rate limiting resp.StatusCode = 429 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test timeouts - assert.True(t, fn(syscall.ETIMEDOUT, req, nil)) + assert.True(t, fn(syscall.ETIMEDOUT, req, nil, expression)) // Test connection errors - assert.True(t, fn(errors.New("connection refused"), req, nil)) + assert.True(t, fn(errors.New("connection refused"), req, nil, expression)) // Test success - should not retry resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) // Test client errors - should not retry resp.StatusCode = 404 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) // Now test the same conditions with a mutation - should never retry mutationReq, _ := createRequestWithContext(OperationTypeMutation) resp.StatusCode = 500 - assert.False(t, fn(nil, mutationReq, resp)) + assert.False(t, fn(nil, mutationReq, resp, expression)) resp.StatusCode = 503 - assert.False(t, fn(nil, mutationReq, resp)) + assert.False(t, fn(nil, mutationReq, resp, expression)) resp.StatusCode = 429 - assert.False(t, fn(nil, mutationReq, resp)) - assert.False(t, fn(syscall.ETIMEDOUT, mutationReq, nil)) - assert.False(t, fn(errors.New("connection refused"), mutationReq, nil)) + assert.False(t, fn(nil, mutationReq, resp, expression)) + assert.False(t, fn(syscall.ETIMEDOUT, mutationReq, nil, expression)) + assert.False(t, fn(errors.New("connection refused"), mutationReq, nil, expression)) }) } -func TestProcessRetryOptions(t 
*testing.T) { - t.Run("process invalid algorithm", func(t *testing.T) { - algorithm := "abcdee" - _, err := ProcessRetryOptions(retrytransport.RetryOptions{ - Enabled: true, - Algorithm: algorithm, - }) - - expectedError := fmt.Sprintf("unsupported retry algorithm: %s", algorithm) - assert.ErrorContains(t, err, expectedError) - }) - - t.Run("process invalid algorithm when retries are disabled", func(t *testing.T) { - algorithm := "abcdee" - _, err := ProcessRetryOptions(retrytransport.RetryOptions{ - Enabled: false, - Algorithm: algorithm, - }) - assert.NoError(t, err) - }) - - t.Run("process invalid expression", func(t *testing.T) { - _, err := ProcessRetryOptions(retrytransport.RetryOptions{ - Enabled: true, - Algorithm: "backoff_jitter", - Expression: "invalid syntax +++", - }) - - assert.ErrorContains(t, err, "failed to build retry function") - }) - - t.Run("process valid options", func(t *testing.T) { - options := retrytransport.RetryOptions{ - Enabled: true, - Algorithm: "backoff_jitter", - Expression: "statusCode == 500 || IsTimeout() || IsConnectionError()", - } - response, err := ProcessRetryOptions(options) - assert.NoError(t, err) - assert.NotSame(t, &options, response) - }) -} +// TODO: Fix +//func TestProcessRetryOptions(t *testing.T) { +// t.Run("process invalid algorithm", func(t *testing.T) { +// algorithm := "abcdee" +// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ +// Enabled: true, +// Algorithm: algorithm, +// }) +// +// expectedError := fmt.Sprintf("unsupported retry algorithm: %s", algorithm) +// assert.ErrorContains(t, err, expectedError) +// }) +// +// t.Run("process invalid algorithm when retries are disabled", func(t *testing.T) { +// algorithm := "abcdee" +// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ +// Enabled: false, +// Algorithm: algorithm, +// }) +// assert.NoError(t, err) +// }) +// +// t.Run("process invalid expression", func(t *testing.T) { +// _, err := 
ProcessRetryOptions(retrytransport.RetryOptions{ +// Enabled: true, +// Algorithm: "backoff_jitter", +// Expression: "invalid syntax +++", +// }) +// +// assert.ErrorContains(t, err, "failed to build retry function") +// }) +// +// t.Run("process valid options", func(t *testing.T) { +// options := retrytransport.RetryOptions{ +// Enabled: true, +// Algorithm: "backoff_jitter", +// Expression: "statusCode == 500 || IsTimeout() || IsConnectionError()", +// } +// response, err := ProcessRetryOptions(options) +// assert.NoError(t, err) +// assert.NotSame(t, &options, response) +// }) +//} diff --git a/router/core/router.go b/router/core/router.go index f5bed12df7..afccb42fbe 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -184,6 +184,27 @@ func (r *SubgraphCircuitBreakerOptions) IsEnabled() bool { return r.CircuitBreaker.Enabled || len(r.SubgraphMap) > 0 } +type SubgraphRetryOptions struct { + All retrytransport.RetryOptions + SubgraphMap map[string]retrytransport.RetryOptions + OnRetryFunc retrytransport.OnRetryFunc +} + +func (r *SubgraphRetryOptions) IsEnabled() bool { + if r == nil { + return false + } + if r.All.Enabled { + return true + } + for _, cfg := range r.SubgraphMap { + if cfg.Enabled { + return true + } + } + return false +} + // NewRouter creates a new Router instance. Router.Start() must be called to start the server. // Alternatively, use Router.NewServer() to create a new server instance without starting it. 
func NewRouter(opts ...Option) (*Router, error) { @@ -1752,26 +1773,9 @@ func WithSubgraphCircuitBreakerOptions(opts *SubgraphCircuitBreakerOptions) Opti } } -func WithSubgraphRetryOptions( - enabled bool, - algorithm string, - maxRetryCount int, - retryMaxDuration, retryInterval time.Duration, - expression string, - onRetryFunc retrytransport.OnRetryFunc, -) Option { +func WithSubgraphRetryOptions(opts *SubgraphRetryOptions) Option { return func(r *Router) { - r.retryOptions = retrytransport.RetryOptions{ - Enabled: enabled, - Algorithm: algorithm, - MaxRetryCount: maxRetryCount, - MaxDuration: retryMaxDuration, - Interval: retryInterval, - Expression: expression, - - // Test case overrides - OnRetry: onRetryFunc, - } + r.retryOptions = opts } } @@ -1901,6 +1905,37 @@ func NewSubgraphCircuitBreakerOptions(cfg config.TrafficShapingRules) *SubgraphC return entry } +func NewSubgraphRetryOptions(cfg config.TrafficShapingRules) *SubgraphRetryOptions { + entry := &SubgraphRetryOptions{ + SubgraphMap: map[string]retrytransport.RetryOptions{}, + } + // If we have a global default + if cfg.All.BackoffJitterRetry.Enabled { + entry.All = newRetryConfig(cfg.All.BackoffJitterRetry) + } + // Subgraph specific circuit breakers + for k, v := range cfg.Subgraphs { + entry.SubgraphMap[k] = newRetryConfig(v.BackoffJitterRetry) + } + + return entry +} + +func newRetryConfig(config config.BackoffJitterRetry) retrytransport.RetryOptions { + algorithm := config.Algorithm + if algorithm == "" { + algorithm = retrytransport.BackoffJitter + } + return retrytransport.RetryOptions{ + Enabled: config.Enabled, + Algorithm: algorithm, + MaxRetryCount: config.MaxAttempts, + MaxDuration: config.MaxDuration, + Interval: config.Interval, + Expression: config.Expression, + } +} + func newCircuitBreakerConfig(cb config.CircuitBreaker) circuit.CircuitBreakerConfig { return circuit.CircuitBreakerConfig{ Enabled: cb.Enabled, diff --git a/router/core/router_config.go b/router/core/router_config.go index 
2a2a78924d..e88ed342a4 100644 --- a/router/core/router_config.go +++ b/router/core/router_config.go @@ -9,7 +9,6 @@ import ( "github.com/wundergraph/cosmo/router/internal/graphqlmetrics" "github.com/wundergraph/cosmo/router/internal/persistedoperation" rd "github.com/wundergraph/cosmo/router/internal/persistedoperation/operationstorage/redis" - "github.com/wundergraph/cosmo/router/internal/retrytransport" "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/controlplane/configpoller" "github.com/wundergraph/cosmo/router/pkg/controlplane/selfregister" @@ -77,12 +76,12 @@ type Config struct { headerRules *config.HeaderRules subgraphTransportOptions *SubgraphTransportOptions subgraphCircuitBreakerOptions *SubgraphCircuitBreakerOptions + retryOptions *SubgraphRetryOptions graphqlMetricsConfig *GraphQLMetricsConfig routerTrafficConfig *config.RouterTrafficConfiguration batchingConfig *BatchingConfig fileUploadConfig *config.FileUpload accessController *AccessController - retryOptions retrytransport.RetryOptions redisClient rd.RDCloser mcpServer *mcpserver.GraphQLSchemaServer processStartTime time.Time @@ -219,7 +218,7 @@ func (c *Config) Usage() map[string]any { usage["file_upload_max_files"] = c.fileUploadConfig.MaxFiles } usage["access_controller"] = c.accessController != nil - usage["retry_options"] = c.retryOptions.Enabled + usage["retry_options"] = c.retryOptions.IsEnabled() usage["development_mode"] = c.developmentMode usage["access_logs"] = c.accessLogsConfig != nil usage["localhost_fallback_inside_docker"] = c.localhostFallbackInsideDocker diff --git a/router/core/supervisor_instance.go b/router/core/supervisor_instance.go index 09605f6d9e..2c9d1b3c86 100644 --- a/router/core/supervisor_instance.go +++ b/router/core/supervisor_instance.go @@ -193,15 +193,7 @@ func optionsFromResources(logger *zap.Logger, config *config.Config) []Option { WithFileUploadConfig(&config.FileUpload), 
WithSubgraphTransportOptions(NewSubgraphTransportOptions(config.TrafficShaping)), WithSubgraphCircuitBreakerOptions(NewSubgraphCircuitBreakerOptions(config.TrafficShaping)), - WithSubgraphRetryOptions( - config.TrafficShaping.All.BackoffJitterRetry.Enabled, - config.TrafficShaping.All.BackoffJitterRetry.Algorithm, - config.TrafficShaping.All.BackoffJitterRetry.MaxAttempts, - config.TrafficShaping.All.BackoffJitterRetry.MaxDuration, - config.TrafficShaping.All.BackoffJitterRetry.Interval, - config.TrafficShaping.All.BackoffJitterRetry.Expression, - nil, - ), + WithSubgraphRetryOptions(NewSubgraphRetryOptions(config.TrafficShaping)), WithCors(&cors.Config{ Enabled: config.CORS.Enabled, AllowOrigins: config.CORS.AllowOrigins, diff --git a/router/core/transport.go b/router/core/transport.go index 96c838689d..1f6b3c3aa5 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -66,11 +66,11 @@ type sfCacheItem struct { func NewCustomTransport( baseRoundTripper http.RoundTripper, - retryOptions retrytransport.RetryOptions, metricStore metric.Store, connectionMetricStore metric.ConnectionMetricStore, enableSingleFlight bool, breaker *circuit.Manager, + retryManager *retrytransport.Manager, enableTraceClient bool, ) *CustomTransport { ct := &CustomTransport{ @@ -102,8 +102,8 @@ func NewCustomTransport( baseRoundTripper = circuit.NewCircuitTripper(baseRoundTripper, breaker, getRequestContextLogger) } - if retryOptions.Enabled { - ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, retryOptions, getRequestContextLogger) + if retryManager.IsEnabled() { + ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, getRequestContextLogger, retryManager) } else { ct.roundTripper = baseRoundTripper } @@ -339,11 +339,11 @@ func (ct *CustomTransport) singleFlightKey(req *http.Request) uint64 { type TransportFactory struct { preHandlers []TransportPreHandler postHandlers []TransportPostHandler - retryOptions retrytransport.RetryOptions 
localhostFallbackInsideDocker bool metricStore metric.Store connectionMetricStore metric.ConnectionMetricStore circuitBreaker *circuit.Manager + retryManager *retrytransport.Manager logger *zap.Logger tracerProvider *sdktrace.TracerProvider tracePropagators propagation.TextMapPropagator @@ -356,7 +356,6 @@ type TransportOptions struct { PreHandlers []TransportPreHandler PostHandlers []TransportPostHandler SubgraphTransportOptions *SubgraphTransportOptions - RetryOptions retrytransport.RetryOptions LocalhostFallbackInsideDocker bool MetricStore metric.Store ConnectionMetricStore metric.ConnectionMetricStore @@ -365,6 +364,7 @@ type TransportOptions struct { TracerProvider *sdktrace.TracerProvider TracePropagators propagation.TextMapPropagator EnableTraceClient bool + RetryManager *retrytransport.Manager } type SubscriptionClientOptions struct { @@ -378,7 +378,6 @@ func NewTransport(opts *TransportOptions) *TransportFactory { return &TransportFactory{ preHandlers: opts.PreHandlers, postHandlers: opts.PostHandlers, - retryOptions: opts.RetryOptions, localhostFallbackInsideDocker: opts.LocalhostFallbackInsideDocker, metricStore: opts.MetricStore, connectionMetricStore: opts.ConnectionMetricStore, @@ -386,6 +385,7 @@ func NewTransport(opts *TransportOptions) *TransportFactory { tracerProvider: opts.TracerProvider, tracePropagators: opts.TracePropagators, circuitBreaker: opts.CircuitBreaker, + retryManager: opts.RetryManager, enableTraceClient: opts.EnableTraceClient, } } @@ -427,11 +427,11 @@ func (t TransportFactory) RoundTripper(enableSingleFlight bool, baseTransport ht ) tp := NewCustomTransport( traceTransport, - t.retryOptions, t.metricStore, t.connectionMetricStore, enableSingleFlight, t.circuitBreaker, + t.retryManager, t.enableTraceClient, ) diff --git a/router/internal/expr/retry_expression.go b/router/internal/expr/retry_expression.go index 4414d29181..0d49664dd6 100644 --- a/router/internal/expr/retry_expression.go +++ 
b/router/internal/expr/retry_expression.go @@ -10,13 +10,27 @@ import ( // RetryExpressionManager handles compilation and evaluation of retry expressions type RetryExpressionManager struct { - program *vm.Program + expressionMap map[string]*vm.Program } -// NewRetryExpressionManager creates a new RetryExpressionManager with the given expression -func NewRetryExpressionManager(expression string) (*RetryExpressionManager, error) { +const defaultRetryExpression = "IsRetryableStatusCode() || IsConnectionError() || IsTimeout()" + +// NewRetryExpressionManager creates a new RetryExpressionManager +func NewRetryExpressionManager() *RetryExpressionManager { + return &RetryExpressionManager{ + expressionMap: make(map[string]*vm.Program), + } +} + +func (c *RetryExpressionManager) AddExpression(exprString string) error { + expression := exprString if expression == "" { - return nil, nil + expression = defaultRetryExpression + } + + // The expression has already been processed, skip recompilation + if _, ok := c.expressionMap[expression]; ok { + return nil } // Compile the expression with retry context @@ -27,22 +41,32 @@ func NewRetryExpressionManager(expression string) (*RetryExpressionManager, erro program, err := expr.Compile(expression, options...) 
if err != nil { - return nil, fmt.Errorf("failed to compile retry expression: %w", handleExpressionError(err)) + return fmt.Errorf("failed to compile retry expression: %w", handleExpressionError(err)) } - return &RetryExpressionManager{ - program: program, - }, nil + // Use the normalized expression string as the key for deduplication + c.expressionMap[expression] = program + return nil } // ShouldRetry evaluates the retry expression with the given context -func (m *RetryExpressionManager) ShouldRetry(ctx RetryContext) (bool, error) { - if m == nil || m.program == nil { - // Use default behavior if no expression is configured +func (m *RetryExpressionManager) ShouldRetry(ctx RetryContext, expressionString string) (bool, error) { + if m == nil { + return false, nil + } + + expression := expressionString + if expression == "" { + expression = defaultRetryExpression + } + + program, ok := m.expressionMap[expression] + if !ok { + // If the expression wasn't pre-compiled, do not retry by default return false, nil } - result, err := expr.Run(m.program, ctx) + result, err := expr.Run(program, ctx) if err != nil { return false, fmt.Errorf("failed to evaluate retry expression: %w", handleExpressionError(err)) } diff --git a/router/internal/expr/retry_expression_test.go b/router/internal/expr/retry_expression_test.go index 04a8c8c268..d57a9d37c5 100644 --- a/router/internal/expr/retry_expression_test.go +++ b/router/internal/expr/retry_expression_test.go @@ -122,7 +122,8 @@ func TestRetryExpressionManager(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - manager, err := NewRetryExpressionManager(tt.expression) + manager := NewRetryExpressionManager() + err := manager.AddExpression(tt.expression) if tt.expectErr { assert.Error(t, err) return @@ -130,7 +131,7 @@ func TestRetryExpressionManager(t *testing.T) { require.NoError(t, err) require.NotNil(t, manager) - result, err := manager.ShouldRetry(tt.ctx) + result, err := 
manager.ShouldRetry(tt.ctx, tt.expression) assert.NoError(t, err) assert.Equal(t, tt.expected, result) }) @@ -138,9 +139,10 @@ func TestRetryExpressionManager(t *testing.T) { } func TestRetryExpressionManager_EmptyExpression(t *testing.T) { - manager, err := NewRetryExpressionManager("") + manager := NewRetryExpressionManager() + err := manager.AddExpression("") assert.NoError(t, err) - assert.Nil(t, manager) + assert.NotNil(t, manager) } func TestLoadRetryContext(t *testing.T) { @@ -493,56 +495,56 @@ func (e *mockTimeoutError) Temporary() bool { func TestRetryExpressionManager_WithSyscallErrors(t *testing.T) { t.Run("expression with specific syscall error helpers", func(t *testing.T) { expression := "IsConnectionRefused() || IsConnectionReset() || IsTimeout()" - manager, err := NewRetryExpressionManager(expression) - require.NoError(t, err) + manager := NewRetryExpressionManager() + require.NoError(t, manager.AddExpression(expression)) require.NotNil(t, manager) // Test ECONNREFUSED ctx := LoadRetryContext(syscall.ECONNREFUSED, nil) - result, err := manager.ShouldRetry(ctx) + result, err := manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test ECONNRESET ctx = LoadRetryContext(syscall.ECONNRESET, nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test ETIMEDOUT ctx = LoadRetryContext(syscall.ETIMEDOUT, nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test unrelated error ctx = LoadRetryContext(errors.New("some other error"), nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.False(t, result) }) t.Run("expression combining status and syscall errors", func(t *testing.T) { expression := "statusCode == 500 || IsConnectionRefused()" - manager, err := 
NewRetryExpressionManager(expression) - require.NoError(t, err) + manager := NewRetryExpressionManager() + require.NoError(t, manager.AddExpression(expression)) require.NotNil(t, manager) // Test with status code ctx := LoadRetryContext(nil, &http.Response{StatusCode: 500}) - result, err := manager.ShouldRetry(ctx) + result, err := manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test with syscall error ctx = LoadRetryContext(syscall.ECONNREFUSED, nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test with neither condition ctx = LoadRetryContext(nil, &http.Response{StatusCode: 200}) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.False(t, result) }) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go new file mode 100644 index 0000000000..ba6b067d36 --- /dev/null +++ b/router/internal/retrytransport/manager.go @@ -0,0 +1,144 @@ +package retrytransport + +import ( + "errors" + "fmt" + "net/http" + "sync" + "time" + + nodev1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" + "github.com/wundergraph/cosmo/router/internal/expr" + "go.uber.org/zap" +) + +type ( + ShouldRetryFunc func(err error, req *http.Request, resp *http.Response, exprString string) bool + OnRetryFunc func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) + requestLoggerGetter func(req *http.Request) *zap.Logger +) + +const ( + BackoffJitter = "backoff_jitter" +) + +type RetryOptions struct { + Enabled bool + Algorithm string + MaxRetryCount int + Interval time.Duration + MaxDuration time.Duration + Expression string +} + +type Manager struct { + retries map[string]*RetryOptions + lock sync.RWMutex + exprManager *expr.RetryExpressionManager + retryFunc ShouldRetryFunc + OnRetry OnRetryFunc 
+}
+
+func NewManager(exprManager *expr.RetryExpressionManager, retryFunc ShouldRetryFunc, onRetryFunc OnRetryFunc) *Manager {
+	return &Manager{
+		retries:     make(map[string]*RetryOptions),
+		exprManager: exprManager,
+		retryFunc:   retryFunc,
+		OnRetry:     onRetryFunc,
+	}
+}
+
+func (m *Manager) Initialize(
+	baseRetryOptions RetryOptions,
+	subgraphRetryOptions map[string]RetryOptions,
+	subgraphs []*nodev1.Subgraph,
+) error {
+	var joinErr error
+
+	defaultSgNames := make([]string, 0, len(subgraphs))
+	customSgNames := make([]string, 0, len(subgraphs))
+
+	for _, subgraph := range subgraphs {
+		entry, ok := subgraphRetryOptions[subgraph.Name]
+		if !ok {
+			defaultSgNames = append(defaultSgNames, subgraph.Name)
+		} else if entry.Enabled {
+			// This also covers the case where a subgraph has retries explicitly disabled
+			customSgNames = append(customSgNames, subgraph.Name)
+		}
+	}
+
+	if len(defaultSgNames) > 0 && baseRetryOptions.Enabled {
+		if baseRetryOptions.Algorithm != BackoffJitter {
+			joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm))
+		} else {
+			err := m.exprManager.AddExpression(baseRetryOptions.Expression)
+			if err != nil {
+				joinErr = errors.Join(joinErr, fmt.Errorf("failed to add base retry expression: %w", err))
+			}
+		}
+	}
+
+	for _, sgName := range defaultSgNames {
+		m.retries[sgName] = &baseRetryOptions
+	}
+
+	for _, sgName := range customSgNames {
+		entry, ok := subgraphRetryOptions[sgName]
+		if !ok {
+			joinErr = errors.Join(joinErr, errors.New("retry config not found for subgraph "+sgName))
+			continue
+		}
+
+		if entry.Algorithm != BackoffJitter {
+			joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", entry.Algorithm))
+			continue
+		}
+
+		m.retries[sgName] = &entry
+
+		err := m.exprManager.AddExpression(entry.Expression)
+		if err != nil {
+			joinErr = errors.Join(joinErr, fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err))
+			continue
+		}
+	}
+
+	return joinErr
+}
+
+func (c *Manager) GetSubgraphOptions(name string) *RetryOptions { + if c == nil { + return nil + } + + c.lock.RLock() + defer c.lock.RUnlock() + + if circuitBreaker, ok := c.retries[name]; ok { + return circuitBreaker + } + return nil +} + +func (c *Manager) IsEnabled() bool { + if c == nil { + return false + } + + c.lock.RLock() + defer c.lock.RUnlock() + + return len(c.retries) > 0 +} + +func (c *Manager) Retry(err error, req *http.Request, resp *http.Response, exprString string) bool { + return c.retryFunc(err, req, resp, exprString) +} + +// OnRetryHook triggers the configured OnRetry callback, if any. +func (c *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { + if c.OnRetry != nil { + c.OnRetry(count, req, resp, sleepDuration, err) + } +} diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index da0b262bea..43f60364a5 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -7,32 +7,16 @@ import ( "strconv" "time" + rcontext "github.com/wundergraph/cosmo/router/internal/context" + "github.com/cloudflare/backoff" "go.uber.org/zap" ) -type ShouldRetryFunc func(err error, req *http.Request, resp *http.Response) bool -type OnRetryFunc func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) - -type RetryOptions struct { - Enabled bool - Algorithm string - MaxRetryCount int - Interval time.Duration - MaxDuration time.Duration - Expression string - ShouldRetry ShouldRetryFunc - - // Test specific only - OnRetry OnRetryFunc -} - -type requestLoggerGetter func(req *http.Request) *zap.Logger - type RetryHTTPTransport struct { - RoundTripper http.RoundTripper - RetryOptions RetryOptions + roundTripper http.RoundTripper getRequestLogger requestLoggerGetter + retryManager *Manager } // parseRetryAfterHeader parses the Retry-After header value 
according to RFC 7231. @@ -92,36 +76,50 @@ func shouldUseRetryAfter(logger *zap.Logger, resp *http.Response, maxDuration ti func NewRetryHTTPTransport( roundTripper http.RoundTripper, - retryOptions RetryOptions, getRequestLogger requestLoggerGetter, + retryManager *Manager, ) *RetryHTTPTransport { return &RetryHTTPTransport{ - RoundTripper: roundTripper, - RetryOptions: retryOptions, + roundTripper: roundTripper, getRequestLogger: getRequestLogger, + retryManager: retryManager, } } func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, error) { - resp, err := rt.RoundTripper.RoundTrip(req) + resp, err := rt.roundTripper.RoundTrip(req) // Short circuit if the request was successful. if err == nil && isResponseOK(resp) { return resp, nil } - b := backoff.New(rt.RetryOptions.MaxDuration, rt.RetryOptions.Interval) + var subgraph string + subgraphCtxVal := req.Context().Value(rcontext.CurrentSubgraphContextKey{}) + if subgraphCtxVal != nil { + if sg, ok := subgraphCtxVal.(string); ok { + subgraph = sg + } + } + + // If there is no option defined for this subgraph + retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) + if retryOptions == nil { + return rt.roundTripper.RoundTrip(req) + } + + b := backoff.New(retryOptions.MaxDuration, retryOptions.Interval) defer b.Reset() requestLogger := rt.getRequestLogger(req) // Retry logic retries := 0 - for (rt.RetryOptions.ShouldRetry(err, req, resp)) && retries < rt.RetryOptions.MaxRetryCount { + for (rt.retryManager.Retry(err, req, resp, retryOptions.Expression)) && retries < retryOptions.MaxRetryCount { retries++ // Check if we should use Retry-After header for 429 responses var sleepDuration time.Duration - if retryAfterDuration, useRetryAfter := shouldUseRetryAfter(requestLogger, resp, rt.RetryOptions.MaxDuration); useRetryAfter { + if retryAfterDuration, useRetryAfter := shouldUseRetryAfter(requestLogger, resp, retryOptions.MaxDuration); useRetryAfter { sleepDuration = retryAfterDuration 
requestLogger.Debug("Using Retry-After header for 429 response", zap.Int("retry", retries), @@ -138,10 +136,8 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro ) } - // Test Specific - if rt.RetryOptions.OnRetry != nil { - rt.RetryOptions.OnRetry(retries, req, resp, sleepDuration, err) - } + // Notify hook before sleeping + rt.retryManager.OnRetryHook(retries, err, req, resp, sleepDuration) // Wait for the specified duration time.Sleep(sleepDuration) @@ -150,7 +146,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro rt.drainBody(resp, requestLogger) // Retry the request - resp, err = rt.RoundTripper.RoundTrip(req) + resp, err = rt.roundTripper.RoundTrip(req) // Short circuit if the request was successful if err == nil && isResponseOK(resp) { diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index e2d49d3f28..5aa9163d49 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -2,6 +2,7 @@ package retrytransport import ( "bytes" + "context" "errors" "fmt" "io" @@ -15,6 +16,9 @@ import ( "github.com/stretchr/testify/assert" "go.uber.org/zap" + + rcontext "github.com/wundergraph/cosmo/router/internal/context" + "github.com/wundergraph/cosmo/router/internal/expr" ) const defaultMaxDuration = 100 * time.Second @@ -51,13 +55,31 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { return dt.handler(req) } +func withSubgraph(req *http.Request, name string) *http.Request { + ctx := context.WithValue(req.Context(), rcontext.CurrentSubgraphContextKey{}, name) + return req.WithContext(ctx) +} + +func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { + mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, 
resp *http.Response, exprString string) bool { + return shouldRetry(err, req, resp) + }, onRetry) + // attach options for the subgraph + mgr.retries[subgraphName] = &opts + return mgr +} + func TestRetryOnHTTP5xx(t *testing.T) { retries := 0 attemptCount := 0 maxRetries := 3 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -72,21 +94,11 @@ func TestRetryOnHTTP5xx(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -102,8 +114,12 @@ func TestRetryOnErrors(t *testing.T) { attemptCount := 0 maxRetries := 3 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ 
handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -116,21 +132,11 @@ func TestRetryOnErrors(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -156,8 +162,12 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { return simpleShouldRetry(err, req, resp) } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ switch attemptCount { @@ -173,21 +183,11 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetryCount, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, 
+ mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -232,8 +232,12 @@ func (b *TrackableBody) Close() error { func TestShortCircuitOnSuccess(t *testing.T) { attemptCount := 0 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + t.Error("OnRetry should not be called when first request succeeds") + }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ // Always return success @@ -243,21 +247,11 @@ func TestShortCircuitOnSuccess(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: 5, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - t.Error("OnRetry should not be called when first request succeeds") - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -276,29 +270,23 @@ func TestMaxRetryCountRespected(t *testing.T) { retries := 0 attemptCount := 0 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err 
error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ // Always return retryable error to test max retry limit return nil, errors.New("always fail") }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -323,8 +311,12 @@ func TestResponseBodyDraining(t *testing.T) { } } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + actualRetries++ + }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { index++ if index < retryCount { @@ -340,21 +332,11 @@ func TestResponseBodyDraining(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: retryCount, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, 
sleepDuration time.Duration, err error) { - actualRetries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -398,8 +380,12 @@ func TestRequestLoggerIsUsed(t *testing.T) { bodies[i] = trackableBody } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + actualRetries++ + }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { index++ if index < retryCount { @@ -415,21 +401,11 @@ func TestRequestLoggerIsUsed(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return requestLogger - }, - RetryOptions: RetryOptions{ - MaxRetryCount: retryCount, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - actualRetries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return requestLogger }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") tr.RoundTrip(req) @@ -459,8 +435,17 @@ func TestOnRetryCallbackInvoked(t *testing.T) { resp *http.Response } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + retryCallbacks = 
append(retryCallbacks, struct { + count int + err error + resp *http.Response + }{count: count, err: err, resp: resp}) + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { if retries < maxRetries { // Return retryable error @@ -472,26 +457,11 @@ func TestOnRetryCallbackInvoked(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - retryCallbacks = append(retryCallbacks, struct { - count int - err error - resp *http.Response - }{count: count, err: err, resp: resp}) - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -519,8 +489,12 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -545,21 +519,11 @@ 
func TestRetryOn429WithDelaySeconds(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 100 * time.Millisecond, // This should be ignored for 429 - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -583,8 +547,12 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -609,21 +577,11 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 100 * time.Millisecond, // This should be ignored for 429 - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + 
func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -642,8 +600,12 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { attemptCount := 0 maxRetries := 2 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -659,21 +621,11 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -692,8 +644,12 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { var retryAfterUsed []bool var expectedDuration time.Duration - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + 
retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -722,21 +678,11 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 100 * time.Millisecond, // This should be ignored for 429 - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -755,8 +701,12 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { attemptCount := 0 maxRetries := 2 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -774,21 +724,11 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * 
time.Millisecond, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -804,8 +744,12 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { attemptCount := 0 maxRetries := 2 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -823,21 +767,11 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -855,8 +789,12 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { // Track which responses used Retry-After vs normal 
backoff var retryAfterUsedPerAttempt []bool - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ switch attemptCount { @@ -902,21 +840,11 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 10 * time.Millisecond, // Should be used for non-429-with-Retry-After cases - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -945,8 +873,12 @@ func TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { return err != nil } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldNotRetry429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ // Always return 429 with Retry-After header @@ -958,21 +890,11 @@ func 
TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { return resp, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: 3, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldNotRetry429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) From 39a200c8aeabfaccc6f220f014c9a214cb035b14 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:07:49 +0530 Subject: [PATCH 02/24] fix: review comments --- router/internal/retrytransport/manager.go | 39 +++++++++++-------- .../retrytransport/retry_transport.go | 2 +- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index ba6b067d36..8121c619da 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -53,6 +53,9 @@ func (m *Manager) Initialize( subgraphRetryOptions map[string]RetryOptions, subgraphs []*nodev1.Subgraph, ) error { + m.lock.Lock() + defer m.lock.Unlock() + var joinErr error defaultSgNames := make([]string, 0, len(subgraphs)) @@ -95,7 +98,8 @@ func (m *Manager) Initialize( continue } - m.retries[sgName] = &entry + opts := entry + m.retries[sgName] = &opts err := m.exprManager.AddExpression(entry.Expression) if err != nil { @@ -107,38 +111,41 @@ func (m *Manager) Initialize( return joinErr } -func (c *Manager) GetSubgraphOptions(name string) *RetryOptions { - if c == nil { +func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { + if m == 
nil { return nil } - c.lock.RLock() - defer c.lock.RUnlock() + m.lock.RLock() + defer m.lock.RUnlock() - if circuitBreaker, ok := c.retries[name]; ok { + if circuitBreaker, ok := m.retries[name]; ok { return circuitBreaker } return nil } -func (c *Manager) IsEnabled() bool { - if c == nil { +func (m *Manager) IsEnabled() bool { + if m == nil { return false } - c.lock.RLock() - defer c.lock.RUnlock() + m.lock.RLock() + defer m.lock.RUnlock() - return len(c.retries) > 0 + return len(m.retries) > 0 } -func (c *Manager) Retry(err error, req *http.Request, resp *http.Response, exprString string) bool { - return c.retryFunc(err, req, resp, exprString) +func (m *Manager) Retry(err error, req *http.Request, resp *http.Response, exprString string) bool { + if m.retryFunc == nil { + return false + } + return m.retryFunc(err, req, resp, exprString) } // OnRetryHook triggers the configured OnRetry callback, if any. -func (c *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { - if c.OnRetry != nil { - c.OnRetry(count, req, resp, sleepDuration, err) +func (m *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { + if m.OnRetry != nil { + m.OnRetry(count, req, resp, sleepDuration, err) } } diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 43f60364a5..8258e263d0 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -104,7 +104,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // If there is no option defined for this subgraph retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) if retryOptions == nil { - return rt.roundTripper.RoundTrip(req) + return resp, nil } b := backoff.New(retryOptions.MaxDuration, retryOptions.Interval) From a16a8f971d26bf1fd084855c9620612f7097124b Mon Sep 
17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:10:18 +0530 Subject: [PATCH 03/24] fix: add test --- router-tests/retry_test.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 010b3ab1d8..4ce9be0bca 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -530,6 +530,33 @@ func TestRetryPerSubgraph(t *testing.T) { require.ErrorContains(t, err, "unsupported retry algorithm") }) + t.Run("invalid algorithm is ignored when retries are disabled (base)", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: false, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) {}) + + require.NoError(t, err) + }) + t.Run("verify invalid algorithm is detected for per subgraphs", func(t *testing.T) { t.Parallel() From a8a9cc2aa781438a667d8852b3f6e6f92b3e20df Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:12:32 +0530 Subject: [PATCH 04/24] fix: tests --- router-tests/retry_test.go | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 4ce9be0bca..9df5208fad 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -557,6 +557,35 @@ func TestRetryPerSubgraph(t *testing.T) { require.NoError(t, err) }) + t.Run("invalid algorithm is ignored when retries are disabled (per subgraph)", func(t *testing.T) { + t.Parallel() + + opts := 
core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: false, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) {}) + + require.NoError(t, err) + }) + t.Run("verify invalid algorithm is detected for per subgraphs", func(t *testing.T) { t.Parallel() From 79cdb75b3c2d3767e7ae7f062f834532e1ce5922 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:19:56 +0530 Subject: [PATCH 05/24] fix: add default options to subgraphs only when required --- router/internal/retrytransport/manager.go | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 8121c619da..01c0327f30 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -71,6 +71,7 @@ func (m *Manager) Initialize( } } + // First validate and add expressions for base retry options if needed if len(defaultSgNames) > 0 && baseRetryOptions.Enabled { if baseRetryOptions.Algorithm != BackoffJitter { joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm)) @@ -78,14 +79,17 @@ func (m *Manager) Initialize( err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { joinErr = errors.Join(joinErr, fmt.Errorf("failed to add base retry expression: %w", err)) + } else { + // Only assign default options if validation succeeds + for _, sgName := range defaultSgNames { + opts := baseRetryOptions + m.retries[sgName] = 
&opts + } } } } - for _, sgName := range defaultSgNames { - m.retries[sgName] = &baseRetryOptions - } - + // Process custom retry options for _, sgName := range customSgNames { entry, ok := subgraphRetryOptions[sgName] if !ok { @@ -94,18 +98,20 @@ func (m *Manager) Initialize( } if entry.Algorithm != BackoffJitter { - joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm)) + joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", entry.Algorithm)) continue } - opts := entry - m.retries[sgName] = &opts - + // Validate expression before assigning options err := m.exprManager.AddExpression(entry.Expression) if err != nil { - joinErr = errors.Join(joinErr, errors.New("retry expression did not get added "+sgName)) + joinErr = errors.Join(joinErr, fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err)) continue } + + // Create a new copy of the options + opts := entry + m.retries[sgName] = &opts } return joinErr From 33446317078bde578480b27bd07dea0765a800fd Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:34:38 +0530 Subject: [PATCH 06/24] fix: review comments --- router/internal/retrytransport/manager.go | 7 ------- router/internal/retrytransport/retry_transport.go | 8 +++++--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 01c0327f30..2cab10bae9 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -148,10 +148,3 @@ func (m *Manager) Retry(err error, req *http.Request, resp *http.Response, exprS } return m.retryFunc(err, req, resp, exprString) } - -// OnRetryHook triggers the configured OnRetry callback, if any. 
-func (m *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { - if m.OnRetry != nil { - m.OnRetry(count, req, resp, sleepDuration, err) - } -} diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 8258e263d0..83cdae16c4 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -104,7 +104,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // If there is no option defined for this subgraph retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) if retryOptions == nil { - return resp, nil + return resp, err } b := backoff.New(retryOptions.MaxDuration, retryOptions.Interval) @@ -136,8 +136,10 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro ) } - // Notify hook before sleeping - rt.retryManager.OnRetryHook(retries, err, req, resp, sleepDuration) + // A hook used for testing + if rt.retryManager.OnRetry != nil { + rt.retryManager.OnRetry(retries, req, resp, sleepDuration, err) + } // Wait for the specified duration time.Sleep(sleepDuration) From e3cebe5f3c4a04880218fd16e1377e68bc17845b Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 2 Sep 2025 16:53:49 +0530 Subject: [PATCH 07/24] fix: cleanup formatting --- router-tests/telemetry/telemetry_test.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/router-tests/telemetry/telemetry_test.go b/router-tests/telemetry/telemetry_test.go index 205128a295..8f4dd99c4d 100644 --- a/router-tests/telemetry/telemetry_test.go +++ b/router-tests/telemetry/telemetry_test.go @@ -826,6 +826,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate 
operation cache telemetry for persisted and non persisted operations", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1203,6 +1204,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate operation cache telemetry when prometheus is also enabled", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1544,6 +1546,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate key and cost eviction metrics with small validation cache size without feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1887,6 +1890,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate operation cache telemetry for default configuration including feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -3558,6 +3562,7 @@ func TestFlakyTelemetry(t *testing.T) { }, }, } + want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -4082,6 +4087,7 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, sn[1].Attributes(), otel.WgEnginePersistedOperationCacheHit.Bool(true)) }) }) + t.Run("Should exclude high cardinality attributes only from metrics if custom exporter is defined", func(t *testing.T) { t.Parallel() @@ -4729,6 +4735,7 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Len(t, 
rm.ScopeMetrics, defaultExposedScopedMetricsCount) + scopeMetric := *integration.GetMetricScopeByName(rm.ScopeMetrics, "cosmo.router") require.Len(t, scopeMetric.Metrics, defaultCosmoRouterMetricsCount) @@ -5099,6 +5106,7 @@ func TestFlakyTelemetry(t *testing.T) { }) }) + t.Run("Should remap metric name to configured value", func(t *testing.T) { t.Parallel() @@ -5439,6 +5447,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) + t.Run("Custom span and resource attributes are attached to all metrics and spans / from header", func(t *testing.T) { t.Parallel() @@ -5870,6 +5879,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) + t.Run("Custom span and resource attributes are attached to all metrics and spans / static", func(t *testing.T) { t.Parallel() @@ -6299,6 +6309,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) + t.Run("Requesting a feature flags will emit different router config version and add the feature flag attribute", func(t *testing.T) { t.Parallel() @@ -6939,6 +6950,7 @@ func TestFlakyTelemetry(t *testing.T) { require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[8].Status()) }) }) + t.Run("Origin connectivity issue is traced", func(t *testing.T) { t.Parallel() @@ -7520,6 +7532,7 @@ func TestFlakyTelemetry(t *testing.T) { integration.AssertAttributeNotInSet(t, resClFiltered.DataPoints[1].Attributes, otel.WgOperationName.String("")) }) }) + t.Run("Custom Metric Attributes", func(t *testing.T) { t.Parallel() @@ -8091,6 +8104,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, want, scopeMetric, metricdatatest.IgnoreTimestamp(), 
metricdatatest.IgnoreValue()) }) }) + t.Run("should not remap high cardinality fields when using cloud exporter but include custom metric attributes", func(t *testing.T) { t.Parallel() @@ -8732,6 +8746,7 @@ func TestFlakyTelemetry(t *testing.T) { require.True(t, found, "expected to find a datapoint with subgraph name and id in the metrics") }) }) + t.Run("Tracing is not affected by custom metric attributes", func(t *testing.T) { t.Parallel() @@ -9176,6 +9191,7 @@ func TestFlakyTelemetry(t *testing.T) { assert.ErrorAs(t, err, &expectedErr) }) }) + t.Run("custom trace metrics with expression", func(t *testing.T) { t.Parallel() @@ -9580,6 +9596,7 @@ func TestFlakyTelemetry(t *testing.T) { }) }) }) + t.Run("verify attribute expressions with subgraph in the expression", func(t *testing.T) { t.Run("verify subgraph expression should only be present for engine fetch", func(t *testing.T) { t.Parallel() @@ -9637,8 +9654,8 @@ func TestFlakyTelemetry(t *testing.T) { require.IsType(t, metricdata.Sum[int64]{}, httpRequestsMetric.Data) data2 := httpRequestsMetric.Data.(metricdata.Sum[int64]) - atts := data2.DataPoints[0].Attributes - _, ok := atts.Value(attribute.Key(key)) + attrs := data2.DataPoints[0].Attributes + _, ok := attrs.Value(attribute.Key(key)) require.False(t, ok) }) }) @@ -10191,6 +10208,7 @@ func TestExcludeAttributesWithCustomExporter(t *testing.T) { metricdatatest.AssertEqual(t, connectionMetrics, *integration.GetMetricByName(engineScope, "router.engine.connections"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run(fmt.Sprintf("cache metrics for %s", usingCustomExporter), func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() From c6c224bdb1b44287f09658f4922b7907b284939e Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Wed, 3 Sep 2025 17:49:14 +0530 Subject: [PATCH 08/24] fix: return error --- router/internal/retrytransport/manager.go | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git 
a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 2cab10bae9..3a0a2d9fe6 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -1,7 +1,6 @@ package retrytransport import ( - "errors" "fmt" "net/http" "sync" @@ -56,8 +55,6 @@ func (m *Manager) Initialize( m.lock.Lock() defer m.lock.Unlock() - var joinErr error - defaultSgNames := make([]string, 0, len(subgraphs)) customSgNames := make([]string, 0, len(subgraphs)) @@ -74,11 +71,11 @@ func (m *Manager) Initialize( // First validate and add expressions for base retry options if needed if len(defaultSgNames) > 0 && baseRetryOptions.Enabled { if baseRetryOptions.Algorithm != BackoffJitter { - joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm)) + return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) } else { err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { - joinErr = errors.Join(joinErr, fmt.Errorf("failed to add base retry expression: %w", err)) + return fmt.Errorf("failed to add base retry expression: %w", err) } else { // Only assign default options if validation succeeds for _, sgName := range defaultSgNames { @@ -93,20 +90,17 @@ func (m *Manager) Initialize( for _, sgName := range customSgNames { entry, ok := subgraphRetryOptions[sgName] if !ok { - joinErr = errors.Join(joinErr, errors.New("retry config not found for subgraph "+sgName)) - continue + return fmt.Errorf("failed to add base retry expression: %s", sgName) } if entry.Algorithm != BackoffJitter { - joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", entry.Algorithm)) - continue + return fmt.Errorf("unsupported retry algorithm for subgraph %s: %s", sgName, baseRetryOptions.Algorithm) } // Validate expression before assigning options err := m.exprManager.AddExpression(entry.Expression) if err != nil { - joinErr = 
errors.Join(joinErr, fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err)) - continue + return fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err) } // Create a new copy of the options @@ -114,7 +108,7 @@ func (m *Manager) Initialize( m.retries[sgName] = &opts } - return joinErr + return nil } func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { From dae502d75dbafc117d49932fccb94ab4e336c277 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Wed, 3 Sep 2025 18:38:26 +0530 Subject: [PATCH 09/24] fix: review comments --- router-tests/retry_test.go | 159 +++++++++++++++++++++++++++++- router/core/retry_builder_test.go | 44 --------- 2 files changed, 156 insertions(+), 47 deletions(-) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 9df5208fad..4e5fc02e6e 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -589,7 +589,6 @@ func TestRetryPerSubgraph(t *testing.T) { t.Run("verify invalid algorithm is detected for per subgraphs", func(t *testing.T) { t.Parallel() - // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ Subgraphs: map[string]config.GlobalSubgraphRequestRule{ "employees": { @@ -618,6 +617,94 @@ func TestRetryPerSubgraph(t *testing.T) { require.ErrorContains(t, err, "unsupported retry algorithm") }) + t.Run("verify invalid expression is detected for base", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "truethere", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, 
func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "failed to add base retry expression: failed to compile retry expression: line 1, column 0: unknown name truethere") + }) + + t.Run("verify invalid expression is detected per subgraphs", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "truethere", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "failed to add retry expression for subgraph employees: failed to compile retry expression: line 1, column 0: unknown name truethere") + }) + + t.Run("verify valid expression and algorithm", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + Algorithm: "backoff_jitter", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + }) + + require.NoError(t, err) + }) + t.Run("verify retries are applied per subgraph", func(t *testing.T) 
{ t.Parallel() @@ -690,7 +777,7 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) require.Equal(t, int32(test1Max+1), test1Calls.Load()) }) @@ -768,9 +855,75 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) require.Equal(t, int32(test1Max+1), test1Calls.Load()) }) }) + + t.Run("verify retries are applied when only all and no subgraph specific overrides are present", func(t *testing.T) { + t.Parallel() + + employeesCalls := atomic.Int32{} + test1Calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + generalMax := 3 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: generalMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + employeesCalls.Add(1) + }) + }, + }, + Test1: testenv.SubgraphConfig{ 
+ Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + test1Calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // 1) Call employees-only query; expect employees subgraph to be retried generalMax times (attempts = retries + 1) + resEmp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query employees { employees { id } }`, + }) + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, int32(generalMax+1), employeesCalls.Load()) + require.Equal(t, int32(0), test1Calls.Load()) + + // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) + resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { floatField(arg: 1.5) }`, + }) + + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, int32(generalMax+1), employeesCalls.Load()) + require.Equal(t, int32(generalMax+1), test1Calls.Load()) + }) + }) } diff --git a/router/core/retry_builder_test.go b/router/core/retry_builder_test.go index 6a2d34cdab..7bfeda9fcd 100644 --- a/router/core/retry_builder_test.go +++ b/router/core/retry_builder_test.go @@ -438,47 +438,3 @@ func TestBuildRetryFunction(t *testing.T) { assert.False(t, fn(errors.New("connection refused"), mutationReq, nil, expression)) }) } - -// TODO: Fix -//func TestProcessRetryOptions(t *testing.T) { -// t.Run("process invalid algorithm", func(t *testing.T) { -// algorithm := "abcdee" -// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ -// Enabled: true, -// Algorithm: 
algorithm, -// }) -// -// expectedError := fmt.Sprintf("unsupported retry algorithm: %s", algorithm) -// assert.ErrorContains(t, err, expectedError) -// }) -// -// t.Run("process invalid algorithm when retries are disabled", func(t *testing.T) { -// algorithm := "abcdee" -// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ -// Enabled: false, -// Algorithm: algorithm, -// }) -// assert.NoError(t, err) -// }) -// -// t.Run("process invalid expression", func(t *testing.T) { -// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ -// Enabled: true, -// Algorithm: "backoff_jitter", -// Expression: "invalid syntax +++", -// }) -// -// assert.ErrorContains(t, err, "failed to build retry function") -// }) -// -// t.Run("process valid options", func(t *testing.T) { -// options := retrytransport.RetryOptions{ -// Enabled: true, -// Algorithm: "backoff_jitter", -// Expression: "statusCode == 500 || IsTimeout() || IsConnectionError()", -// } -// response, err := ProcessRetryOptions(options) -// assert.NoError(t, err) -// assert.NotSame(t, &options, response) -// }) -//} From 3c7647c333cb9ba703b6ddfa88fab3e6ffee1df2 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Wed, 3 Sep 2025 18:58:24 +0530 Subject: [PATCH 10/24] fix: tests --- router-tests/retry_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 4e5fc02e6e..5afd96c3a6 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -472,8 +472,8 @@ func TestFlakyRetry(t *testing.T) { Employees: testenv.SubgraphConfig{ Middleware: func(_ http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusTooManyRequests) w.Header().Set("Retry-After", strconv.Itoa(emptyRetryInterval)) + w.WriteHeader(http.StatusTooManyRequests) serviceCallsCounter.Add(1) }) }, From c7b4040ed3dc248a2965e4ac6893318aeaa8c891 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: 
Wed, 3 Sep 2025 23:47:46 +0530 Subject: [PATCH 11/24] fix: review comments --- router/core/router.go | 2 +- router/internal/retrytransport/manager.go | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/router/core/router.go b/router/core/router.go index afccb42fbe..1b2ab01ffb 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -1913,7 +1913,7 @@ func NewSubgraphRetryOptions(cfg config.TrafficShapingRules) *SubgraphRetryOptio if cfg.All.BackoffJitterRetry.Enabled { entry.All = newRetryConfig(cfg.All.BackoffJitterRetry) } - // Subgraph specific circuit breakers + // Subgraph specific retry configs for k, v := range cfg.Subgraphs { entry.SubgraphMap[k] = newRetryConfig(v.BackoffJitterRetry) } diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 3a0a2d9fe6..1c7fdff1ad 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -52,9 +52,6 @@ func (m *Manager) Initialize( subgraphRetryOptions map[string]RetryOptions, subgraphs []*nodev1.Subgraph, ) error { - m.lock.Lock() - defer m.lock.Unlock() - defaultSgNames := make([]string, 0, len(subgraphs)) customSgNames := make([]string, 0, len(subgraphs)) @@ -119,8 +116,8 @@ func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { m.lock.RLock() defer m.lock.RUnlock() - if circuitBreaker, ok := m.retries[name]; ok { - return circuitBreaker + if retryOptions, ok := m.retries[name]; ok { + return retryOptions } return nil } From a976f1614a37dc5ea7a8128cf86e13961c7c4e54 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 16 Sep 2025 21:38:18 +0530 Subject: [PATCH 12/24] fix: merge resolving --- router/core/router_config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router/core/router_config.go b/router/core/router_config.go index 6b95ba8335..62ae89bd65 100644 --- a/router/core/router_config.go +++ b/router/core/router_config.go @@ -8,7 +8,7 @@ import ( nodev1 
"github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" "github.com/wundergraph/cosmo/router/internal/graphqlmetrics" "github.com/wundergraph/cosmo/router/internal/persistedoperation" - rd "github.com/wundergraph/cosmo/router/internal/persistedoperation/operationstorage/redis" + rd "github.com/wundergraph/cosmo/router/internal/rediscloser" "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/controlplane/configpoller" "github.com/wundergraph/cosmo/router/pkg/controlplane/selfregister" From f4090fa1b45ffd7e4207c95595a65da314f94af4 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 16 Sep 2025 21:38:27 +0530 Subject: [PATCH 13/24] fix: upgrade request --- router-tests/circuit_breaker_test.go | 134 ++++ router-tests/retry_test.go | 102 +++ router-tests/websocket_test.go | 13 + router/core/engine_loader_hooks.go | 2 - router/core/transport.go | 32 +- router/internal/circuit/breaker.go | 27 +- router/internal/context/keys.go | 2 - .../retrytransport/retry_transport.go | 33 +- .../retrytransport/retry_transport_test.go | 728 ++++++++---------- router/internal/traceclient/traceclient.go | 33 +- 10 files changed, 646 insertions(+), 460 deletions(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index 7e7f1b60eb..2ee4ef21c0 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -2,6 +2,7 @@ package integration import ( "context" + "github.com/gorilla/websocket" "net/http" "sort" "sync/atomic" @@ -623,6 +624,139 @@ func TestCircuitBreaker(t *testing.T) { }) }) + t.Run("verify circuit breaker tripping on upgrade requests", func(t *testing.T) { + t.Parallel() + + const failedTries int64 = 3 + + const timestampMessage = `{"type":"next","id":"1","payload":{"data":{"currentTime":{"unixTime":1,"timeStamp":"2021-09-01T12:00:00Z"}}}}` + const completeMessage = `{"type":"complete","id":"1"}` + const defaultErrorMessage = 
`{"id":"1","type":"error","payload":[{"message":"Internal server error"}]}` + + breaker := getCircuitBreakerWithDefaults() + breaker.RequestThreshold = 3 + breaker.ErrorThresholdPercentage = 100 + + breaker.NumBuckets = 1 + breaker.RollingDuration = 5000 * time.Millisecond + + trafficConfig := getTrafficConfigWithTimeout(breaker, 1*time.Second) + + employeesCalls := atomic.Int64{} + + testenv.Run(t, &testenv.Config{ + ModifyEngineExecutionConfiguration: func(engineExecutionConfiguration *config.EngineExecutionConfiguration) { + engineExecutionConfiguration.WebSocketClientReadTimeout = time.Millisecond * 2000 + }, + LogObservation: testenv.LogObservationConfig{ + Enabled: true, + LogLevel: zapcore.DebugLevel, + }, + RouterOptions: []core.Option{ + core.WithSubgraphCircuitBreakerOptions(core.NewSubgraphCircuitBreakerOptions(trafficConfig)), + core.WithSubgraphTransportOptions(core.NewSubgraphTransportOptions(trafficConfig)), + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + employeesCalls.Add(1) + if employeesCalls.Load() <= failedTries { + simulateConnectionFailureOnClose(w) + return + } + + upgrader := websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return true + }, + Subprotocols: []string{"graphql-transport-ws"}, + } + conn, err := upgrader.Upgrade(w, r, nil) + require.NoError(t, err) + defer func() { + _ = conn.Close() + }() + + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(completeMessage)) + require.NoError(t, err) + }) + }, + }, + }, + }, func(t 
*testing.T, xEnv *testenv.Environment) { + for i := range failedTries + 2 { + conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) + err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ + ID: "1", + Type: "subscribe", + Payload: []byte(`{"query":"subscription { currentTime { unixTime timeStamp }}"}`), + }) + require.NoError(t, err) + + _, message, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + require.Equal(t, defaultErrorMessage, string(message)) + require.NoError(t, conn.Close()) + + switch { + case i < breaker.RequestThreshold-1: + require.Zero(t, xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) + case i == breaker.RequestThreshold-1: + require.Equal(t, 1, xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) + case i > breaker.RequestThreshold-1: + expectedCount := i - (breaker.RequestThreshold - 1) + require.Equal(t, int(expectedCount), xEnv.Observer().FilterMessage("Circuit breaker open, request callback did not execute").Len()) + } + + } + + require.Equal(t, failedTries, employeesCalls.Load()) + + // Wait for current bucket to be cleaned up + time.Sleep(breaker.RollingDuration + time.Millisecond*500) + + // ==== + // Verify a success case with messages validated from here onwards + // ==== + + // Sending a complete must stop the subscription + conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) + err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ + ID: "1", + Type: "subscribe", + Payload: []byte(`{"query":"subscription { currentTime { unixTime timeStamp }}"}`), + }) + require.NoError(t, err) + + _, message, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + require.JSONEq(t, timestampMessage, string(message)) + + err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) + require.NoError(t, err) + + err = conn.SetReadDeadline(time.Now().Add(2 * time.Second)) + require.NoError(t, err) + + _, actualCompleteMessage, err := 
testenv.WSReadMessage(t, conn) + require.NoError(t, err) + require.JSONEq(t, completeMessage, string(actualCompleteMessage)) + + xEnv.WaitForSubscriptionCount(0, time.Second*5) + }) + }) + t.Run("circuit breaker metrics", func(t *testing.T) { t.Parallel() diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 5afd96c3a6..d52c3ef1ab 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -1,6 +1,7 @@ package integration import ( + "github.com/gorilla/websocket" "net/http" "strconv" "sync/atomic" @@ -926,4 +927,105 @@ func TestRetryPerSubgraph(t *testing.T) { require.Equal(t, int32(generalMax+1), test1Calls.Load()) }) }) + + t.Run("verify retry on upgrade requests", func(t *testing.T) { + t.Parallel() + + const failedTries int64 = 2 + + const timestampMessage = `{"type":"next","id":"1","payload":{"data":{"currentTime":{"unixTime":1,"timeStamp":"2021-09-01T12:00:00Z"}}}}` + const completeMessage = `{"type":"complete","id":"1"}` + + employeesCalls := atomic.Int64{} + + testenv.Run(t, &testenv.Config{ + ModifyEngineExecutionConfiguration: func(engineExecutionConfiguration *config.EngineExecutionConfiguration) { + engineExecutionConfiguration.WebSocketClientReadTimeout = time.Millisecond * 2000 + }, + + RouterOptions: []core.Option{ + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 5, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: "IsRetryableStatusCode() || IsConnectionError() || IsTimeout()", + }, + }, + })), + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + employeesCalls.Add(1) + if employeesCalls.Load() <= failedTries { + w.WriteHeader(http.StatusBadGateway) + return + } + + 
upgrader := websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return true + }, + Subprotocols: []string{"graphql-transport-ws"}, + } + conn, err := upgrader.Upgrade(w, r, nil) + require.NoError(t, err) + defer func() { + _ = conn.Close() + }() + + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(completeMessage)) + require.NoError(t, err) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) + err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ + ID: "1", + Type: "subscribe", + Payload: []byte(`{"query":"subscription { currentTime { unixTime timeStamp }}"}`), + }) + require.NoError(t, err) + + // Read a result and store its timestamp, next result should be 1 second later + _, messageBytes, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + require.Equal(t, failedTries+1, employeesCalls.Load()) + + // ==== + // Verify the messages are correct from here onwards + // ==== + require.JSONEq(t, timestampMessage, string(messageBytes)) + + // Sending a complete must stop the subscription + err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) + require.NoError(t, err) + + err = conn.SetReadDeadline(time.Now().Add(2 * time.Second)) + require.NoError(t, err) + + _, actualCompleteMessage, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + require.JSONEq(t, completeMessage, string(actualCompleteMessage)) + + require.NoError(t, conn.Close()) + xEnv.WaitForSubscriptionCount(0, time.Second*5) + }) + }) } diff --git a/router-tests/websocket_test.go 
b/router-tests/websocket_test.go index 5eb16bacb5..7386cce290 100644 --- a/router-tests/websocket_test.go +++ b/router-tests/websocket_test.go @@ -706,6 +706,18 @@ func TestWebSockets(t *testing.T) { t.Run("empty allow lists should allow all headers and query args", func(t *testing.T) { t.Parallel() + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 5, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: "true", + }, + }, + }) + headerRules := config.HeaderRules{ All: &config.GlobalHeaderRule{ Request: []*config.RequestHeaderRule{ @@ -734,6 +746,7 @@ func TestWebSockets(t *testing.T) { }, RouterOptions: []core.Option{ core.WithHeaderRules(headerRules), + core.WithSubgraphRetryOptions(opts), }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ diff --git a/router/core/engine_loader_hooks.go b/router/core/engine_loader_hooks.go index e3da6fe26e..579553e6f8 100644 --- a/router/core/engine_loader_hooks.go +++ b/router/core/engine_loader_hooks.go @@ -92,8 +92,6 @@ func (f *engineLoaderHooks) OnLoad(ctx context.Context, ds resolve.DataSourceInf start := time.Now() - ctx = context.WithValue(ctx, rcontext.CurrentSubgraphContextKey{}, ds.Name) - duration := atomic.Int64{} ctx = context.WithValue(ctx, rcontext.FetchTimingKey, &duration) diff --git a/router/core/transport.go b/router/core/transport.go index f16533210f..2211936ba6 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -84,31 +84,41 @@ func NewCustomTransport( // As a workaround we pass in a function that can be used to get the logger from within the round tripper getRequestContextLogger := func(req *http.Request) *zap.Logger { reqContext := getRequestContext(req.Context()) + if reqContext == nil { + return zap.NewNop() + } return reqContext.Logger() } + getActiveSubgraphName := func(req *http.Request) string 
{ + reqContext := getRequestContext(req.Context()) + if reqContext == nil { + return "" + } + subgraph := reqContext.ActiveSubgraph(req) + if subgraph != nil { + return subgraph.Name + } + return "" + } + if enableTraceClient { - getValuesFromRequest := func(ctx context.Context, req *http.Request) (*expr.Context, string) { + getExprContext := func(ctx context.Context, req *http.Request) *expr.Context { reqContext := getRequestContext(ctx) if reqContext == nil { - return &expr.Context{}, "" - } - - var activeSubgraphName string - if activeSubgraph := reqContext.ActiveSubgraph(req); activeSubgraph != nil { - activeSubgraphName = activeSubgraph.Name + return &expr.Context{} } - return &reqContext.expressionContext, activeSubgraphName + return &reqContext.expressionContext } - baseRoundTripper = traceclient.NewTraceInjectingRoundTripper(baseRoundTripper, connectionMetricStore, getValuesFromRequest) + baseRoundTripper = traceclient.NewTraceInjectingRoundTripper(baseRoundTripper, connectionMetricStore, getExprContext, getActiveSubgraphName) } if breaker.HasCircuits() { - baseRoundTripper = circuit.NewCircuitTripper(baseRoundTripper, breaker, getRequestContextLogger) + baseRoundTripper = circuit.NewCircuitTripper(baseRoundTripper, breaker, getRequestContextLogger, getActiveSubgraphName) } if retryManager.IsEnabled() { - ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, getRequestContextLogger, retryManager) + ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, getRequestContextLogger, retryManager, getActiveSubgraphName) } else { ct.roundTripper = baseRoundTripper } diff --git a/router/internal/circuit/breaker.go b/router/internal/circuit/breaker.go index bbabdb42f2..382f355032 100644 --- a/router/internal/circuit/breaker.go +++ b/router/internal/circuit/breaker.go @@ -4,34 +4,27 @@ import ( "context" "net/http" - rcontext "github.com/wundergraph/cosmo/router/internal/context" "go.uber.org/zap" ) type Breaker struct { - 
roundTripper http.RoundTripper - loggerFunc func(req *http.Request) *zap.Logger - circuitBreaker *Manager + roundTripper http.RoundTripper + loggerFunc func(req *http.Request) *zap.Logger + circuitBreaker *Manager + getActiveSubgraphName func(req *http.Request) string } -func NewCircuitTripper(roundTripper http.RoundTripper, breaker *Manager, logger func(req *http.Request) *zap.Logger) *Breaker { +func NewCircuitTripper(roundTripper http.RoundTripper, breaker *Manager, logger func(req *http.Request) *zap.Logger, getActiveSubgraphName func(req *http.Request) string) *Breaker { return &Breaker{ - circuitBreaker: breaker, - loggerFunc: logger, - roundTripper: roundTripper, + circuitBreaker: breaker, + loggerFunc: logger, + roundTripper: roundTripper, + getActiveSubgraphName: getActiveSubgraphName, } } func (rt *Breaker) RoundTrip(req *http.Request) (resp *http.Response, err error) { - ctx := req.Context() - - var subgraph string - subgraphCtxVal := ctx.Value(rcontext.CurrentSubgraphContextKey{}) - if subgraphCtxVal != nil { - if sg, ok := subgraphCtxVal.(string); ok { - subgraph = sg - } - } + subgraph := rt.getActiveSubgraphName(req) // If there is no circuit defined for this subgraph circuit := rt.circuitBreaker.GetCircuitBreaker(subgraph) diff --git a/router/internal/context/keys.go b/router/internal/context/keys.go index d1706f4ab4..3524280e02 100644 --- a/router/internal/context/keys.go +++ b/router/internal/context/keys.go @@ -4,8 +4,6 @@ // instead of being moved into core package context -type CurrentSubgraphContextKey struct{} - type ContextKey int const ( diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 83cdae16c4..26fcee92e3 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -7,16 +7,15 @@ import ( "strconv" "time" - rcontext "github.com/wundergraph/cosmo/router/internal/context" - "github.com/cloudflare/backoff" 
"go.uber.org/zap" ) type RetryHTTPTransport struct { - roundTripper http.RoundTripper - getRequestLogger requestLoggerGetter - retryManager *Manager + roundTripper http.RoundTripper + getRequestLogger requestLoggerGetter + retryManager *Manager + getActiveSubgraph func(req *http.Request) string } // parseRetryAfterHeader parses the Retry-After header value according to RFC 7231. @@ -74,15 +73,12 @@ func shouldUseRetryAfter(logger *zap.Logger, resp *http.Response, maxDuration ti return duration, duration > 0 } -func NewRetryHTTPTransport( - roundTripper http.RoundTripper, - getRequestLogger requestLoggerGetter, - retryManager *Manager, -) *RetryHTTPTransport { +func NewRetryHTTPTransport(roundTripper http.RoundTripper, getRequestLogger requestLoggerGetter, retryManager *Manager, getActiveSubgraph func(req *http.Request) string) *RetryHTTPTransport { return &RetryHTTPTransport{ - roundTripper: roundTripper, - getRequestLogger: getRequestLogger, - retryManager: retryManager, + roundTripper: roundTripper, + getRequestLogger: getRequestLogger, + getActiveSubgraph: getActiveSubgraph, + retryManager: retryManager, } } @@ -93,16 +89,10 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro return resp, nil } - var subgraph string - subgraphCtxVal := req.Context().Value(rcontext.CurrentSubgraphContextKey{}) - if subgraphCtxVal != nil { - if sg, ok := subgraphCtxVal.(string); ok { - subgraph = sg - } - } + activeSubgraph := rt.getActiveSubgraph(req) + retryOptions := rt.retryManager.GetSubgraphOptions(activeSubgraph) // If there is no option defined for this subgraph - retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) if retryOptions == nil { return resp, err } @@ -115,6 +105,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // Retry logic retries := 0 for (rt.retryManager.Retry(err, req, resp, retryOptions.Expression)) && retries < retryOptions.MaxRetryCount { + retries++ // Check if we should use 
Retry-After header for 429 responses diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index 5aa9163d49..e58a1228f2 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -2,7 +2,6 @@ package retrytransport import ( "bytes" - "context" "errors" "fmt" "io" @@ -17,12 +16,19 @@ import ( "github.com/stretchr/testify/assert" "go.uber.org/zap" - rcontext "github.com/wundergraph/cosmo/router/internal/context" "github.com/wundergraph/cosmo/router/internal/expr" ) const defaultMaxDuration = 100 * time.Second +var loggerFunc = func(req *http.Request) *zap.Logger { return zap.NewNop() } + +var getActiveSubgraph = func(subgraphName string) func(req *http.Request) string { + return func(req *http.Request) string { + return subgraphName + } +} + // simpleShouldRetry provides simple retry logic for testing the transport implementation func simpleShouldRetry(err error, req *http.Request, resp *http.Response) bool { // Simple logic for testing - retry on 5xx status codes or any error @@ -55,11 +61,6 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { return dt.handler(req) } -func withSubgraph(req *http.Request, name string) *http.Request { - ctx := context.WithValue(req.Context(), rcontext.CurrentSubgraphContextKey{}, name) - return req.WithContext(ctx) -} - func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, resp *http.Response, exprString string) bool { return shouldRetry(err, req, resp) @@ -74,31 +75,28 @@ func TestRetryOnHTTP5xx(t *testing.T) { attemptCount := 0 maxRetries := 3 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration 
time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 500 to trigger retry - return &http.Response{ - StatusCode: http.StatusInternalServerError, - }, nil - } - // Finally return success + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 500 to trigger retry return &http.Response{ - StatusCode: http.StatusOK, + StatusCode: http.StatusInternalServerError, }, nil - }, + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -114,29 +112,26 @@ func TestRetryOnErrors(t *testing.T) { attemptCount := 0 maxRetries := 3 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return any error to trigger retry - return nil, errors.New("some network error") - } - // Finally 
return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return any error to trigger retry + return nil, errors.New("some network error") + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -162,32 +157,29 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { return simpleShouldRetry(err, req, resp) } + subgraphName := "sg" mgr := newTestManager(shouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - switch attemptCount { - case 1: - // First attempt: return retryable error - return nil, errors.New("retryable error") - case 2: - // Second attempt: return retryable status code - return &http.Response{StatusCode: http.StatusInternalServerError}, nil - default: - // Third attempt: return non-retryable error (should stop retrying) - return nil, nonRetryableError - } - }, + }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := 
NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + switch attemptCount { + case 1: + // First attempt: return retryable error + return nil, errors.New("retryable error") + case 2: + // Second attempt: return retryable status code + return &http.Response{StatusCode: http.StatusInternalServerError}, nil + default: + // Third attempt: return non-retryable error (should stop retrying) + return nil, nonRetryableError + } }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -232,26 +224,23 @@ func (b *TrackableBody) Close() error { func TestShortCircuitOnSuccess(t *testing.T) { attemptCount := 0 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { t.Error("OnRetry should not be called when first request succeeds") - }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - // Always return success - return &http.Response{ - StatusCode: http.StatusOK, - Body: io.NopCloser(strings.NewReader("success")), - }, nil - }, + }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + // Always return success + return &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader("success")), + }, nil }, - func(req *http.Request) 
*zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -270,23 +259,20 @@ func TestMaxRetryCountRespected(t *testing.T) { retries := 0 attemptCount := 0 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - // Always return retryable error to test max retry limit - return nil, errors.New("always fail") - }, + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + // Always return retryable error to test max retry limit + return nil, errors.New("always fail") }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -311,32 +297,29 @@ func TestResponseBodyDraining(t *testing.T) { } } + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { actualRetries++ - }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 
* time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - index++ - if index < retryCount { - return &http.Response{ - StatusCode: http.StatusInternalServerError, - Body: bodies[index], - }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil - } - }, + }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + index++ + if index < retryCount { + return &http.Response{ + StatusCode: http.StatusInternalServerError, + Body: bodies[index], + }, nil + } else { + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil + } }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -380,34 +363,32 @@ func TestRequestLoggerIsUsed(t *testing.T) { bodies[i] = trackableBody } + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { actualRetries++ - }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - index++ - if index < retryCount { - return &http.Response{ - StatusCode: http.StatusInternalServerError, - Body: bodies[index], - }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil - } - }, + }, RetryOptions{MaxRetryCount: 
retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + index++ + if index < retryCount { + return &http.Response{ + StatusCode: http.StatusInternalServerError, + Body: bodies[index], + }, nil + } else { + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil + } }, - func(req *http.Request) *zap.Logger { return requestLogger }, - mgr, - ) + }, func(req *http.Request) *zap.Logger { return requestLogger }, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) - tr.RoundTrip(req) + _, err := tr.RoundTrip(req) + assert.NoError(t, err) assert.Contains(t, requestLoggerBuf.String(), "Failed draining when discarding the body\t{\"error\": \"retry read error, index: 1\"}") assert.Contains(t, requestLoggerBuf.String(), "Failed draining when closing the body\t{\"error\": \"retry close error, index: 2\"}") @@ -435,6 +416,7 @@ func TestOnRetryCallbackInvoked(t *testing.T) { resp *http.Response } + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ retryCallbacks = append(retryCallbacks, struct { @@ -442,26 +424,22 @@ func TestOnRetryCallbackInvoked(t *testing.T) { err error resp *http.Response }{count: count, err: err, resp: resp}) - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - if retries < maxRetries { - // Return retryable error - return nil, errors.New("retryable error") - } - // Finally return success - return &http.Response{ - StatusCode: 
http.StatusOK, - }, nil - }, + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + if retries < maxRetries { + // Return retryable error + return nil, errors.New("retryable error") + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -489,41 +467,38 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool + subgraphName := `sg` mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with Retry-After header in seconds - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) - - // Verify the header is parsed correctly - duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsed = append(retryAfterUsed, useRetryAfter) - assert.True(t, useRetryAfter, "Should use Retry-After header for 429") - assert.Equal(t, 
time.Duration(retryAfterSeconds)*time.Second, duration) - - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with Retry-After header in seconds + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + + // Verify the header is parsed correctly + duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsed = append(retryAfterUsed, useRetryAfter) + assert.True(t, useRetryAfter, "Should use Retry-After header for 429") + assert.Equal(t, time.Duration(retryAfterSeconds)*time.Second, duration) + + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -547,41 +522,38 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := 
NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with Retry-After header in seconds - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) - - // Verify the header is parsed correctly - duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, maxDuration) - retryAfterUsed = append(retryAfterUsed, useRetryAfter) - assert.True(t, useRetryAfter, "Should use Retry-After header for 429") - assert.Equal(t, maxDuration, duration) - - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with Retry-After header in seconds + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + + // Verify the header is parsed correctly + duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, maxDuration) + retryAfterUsed = append(retryAfterUsed, useRetryAfter) + assert.True(t, useRetryAfter, "Should use Retry-After header for 429") + assert.Equal(t, maxDuration, duration) + + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", 
"http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -600,32 +572,29 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { attemptCount := 0 maxRetries := 2 + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 without Retry-After header - return &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - }, nil - } - // Finally return success + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 without Retry-After header return &http.Response{ - StatusCode: http.StatusOK, + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), }, nil - }, + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -644,45 +613,42 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { var retryAfterUsed []bool var expectedDuration time.Duration + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp 
*http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with Retry-After header as HTTP-date (1 second in future to keep test fast) - expectedDuration = 1 * time.Second - futureTime := time.Now().UTC().Add(expectedDuration) - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", futureTime.Format(http.TimeFormat)) - - // Verify the header is parsed correctly - duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsed = append(retryAfterUsed, useRetryAfter) - assert.True(t, useRetryAfter, "Should use Retry-After header for 429") - // Allow reasonable tolerance for execution delay between time creation and parsing - assert.True(t, duration > 0 && duration <= expectedDuration, - "Duration should be positive and <= %v, got %v", expectedDuration, duration) - - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with Retry-After header as HTTP-date (1 second in future to keep test fast) + expectedDuration = 1 * time.Second + futureTime := time.Now().UTC().Add(expectedDuration) + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", futureTime.Format(http.TimeFormat)) + + // Verify the header is parsed correctly + 
duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsed = append(retryAfterUsed, useRetryAfter) + assert.True(t, useRetryAfter, "Should use Retry-After header for 429") + // Allow reasonable tolerance for execution delay between time creation and parsing + assert.True(t, duration > 0 && duration <= expectedDuration, + "Duration should be positive and <= %v, got %v", expectedDuration, duration) + + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -701,34 +667,31 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { attemptCount := 0 maxRetries := 2 + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with invalid Retry-After header - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "invalid-value") - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + 
// Return 429 with invalid Retry-After header + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", "invalid-value") + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -744,34 +707,31 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { attemptCount := 0 maxRetries := 2 + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with negative Retry-After value (should fall back to normal backoff) - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "-1") - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with negative Retry-After value (should fall back to normal backoff) + resp := &http.Response{ + StatusCode: 
http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", "-1") + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -789,62 +749,59 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { // Track which responses used Retry-After vs normal backoff var retryAfterUsedPerAttempt []bool + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - switch attemptCount { - case 1: - // First: 429 with Retry-After - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "1") - - // Verify this should use Retry-After - _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) - - return resp, nil - case 2: - // Second: Network error (should use normal backoff) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, false) - return nil, errors.New("network error") - case 3: - // Third: 500 error (should use normal backoff) - resp := &http.Response{ - StatusCode: http.StatusInternalServerError, - } - _, 
useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) - return resp, nil - case 4: - // Fourth: 429 without Retry-After (should use normal backoff) - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) - return resp, nil - default: - // Finally: Success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + switch attemptCount { + case 1: + // First: 429 with Retry-After + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - }, + resp.Header.Set("Retry-After", "1") + + // Verify this should use Retry-After + _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) + + return resp, nil + case 2: + // Second: Network error (should use normal backoff) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, false) + return nil, errors.New("network error") + case 3: + // Third: 500 error (should use normal backoff) + resp := &http.Response{ + StatusCode: http.StatusInternalServerError, + } + _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) + return resp, nil + case 4: + // Fourth: 429 without Retry-After (should use normal backoff) + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), + } + _, useRetryAfter := 
shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) + return resp, nil + default: + // Finally: Success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil + } }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -873,28 +830,25 @@ func TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { return err != nil } + subgraphName := `sg` mgr := newTestManager(shouldNotRetry429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - // Always return 429 with Retry-After header - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "1") - return resp, nil - }, + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + // Always return 429 with Retry-After header + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), + } + resp.Header.Set("Retry-After", "1") + return resp, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", 
nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) diff --git a/router/internal/traceclient/traceclient.go b/router/internal/traceclient/traceclient.go index d787473330..3207e3697f 100644 --- a/router/internal/traceclient/traceclient.go +++ b/router/internal/traceclient/traceclient.go @@ -6,7 +6,6 @@ import ( "net/http/httptrace" "time" - rcontext "github.com/wundergraph/cosmo/router/internal/context" "github.com/wundergraph/cosmo/router/internal/expr" "github.com/wundergraph/cosmo/router/pkg/metric" @@ -33,20 +32,23 @@ type ClientTrace struct { type ClientTraceContextKey struct{} type TraceInjectingRoundTripper struct { - base http.RoundTripper - connectionMetricStore metric.ConnectionMetricStore - reqContextValuesGetter func(ctx context.Context, req *http.Request) (*expr.Context, string) + base http.RoundTripper + connectionMetricStore metric.ConnectionMetricStore + getExprContext func(ctx context.Context, req *http.Request) *expr.Context + getActiveSubgraphName func(req *http.Request) string } func NewTraceInjectingRoundTripper( base http.RoundTripper, connectionMetricStore metric.ConnectionMetricStore, - reqContextValuesGetter func(ctx context.Context, req *http.Request) (*expr.Context, string), + getExprContext func(ctx context.Context, req *http.Request) *expr.Context, + getActiveSubgraphName func(req *http.Request) string, ) *TraceInjectingRoundTripper { return &TraceInjectingRoundTripper{ - base: base, - connectionMetricStore: connectionMetricStore, - reqContextValuesGetter: reqContextValuesGetter, + base: base, + connectionMetricStore: connectionMetricStore, + getExprContext: getExprContext, + getActiveSubgraphName: getActiveSubgraphName, } } @@ -104,23 +106,14 @@ func (t *TraceInjectingRoundTripper) getClientTrace(ctx context.Context) *httptr func (t *TraceInjectingRoundTripper) processConnectionMetrics(ctx context.Context, req *http.Request) { trace := 
GetClientTraceFromContext(ctx) - var subgraph string - subgraphCtxVal := ctx.Value(rcontext.CurrentSubgraphContextKey{}) - if subgraphCtxVal != nil { - subgraph = subgraphCtxVal.(string) - } - - // We have a fallback for active subgraph name in case engine loader hooks is not called - // TODO: Evaluate if we actually need a fallback and if we can use only one way to get the active subgraph name - exprContext, activeSubgraphName := t.reqContextValuesGetter(ctx, req) - if subgraph == "" { - subgraph = activeSubgraphName - } + exprContext := t.getExprContext(ctx, req) if trace.ConnectionGet != nil && trace.ConnectionAcquired != nil { duration := trace.ConnectionAcquired.Time.Sub(trace.ConnectionGet.Time) exprContext.Subgraph.Request.ClientTrace.ConnectionAcquireDuration = duration + subgraph := t.getActiveSubgraphName(req) + serverAttributes := rotel.GetServerAttributes(trace.ConnectionGet.HostPort) serverAttributes = append( serverAttributes, From bceee7efe357a6ffd07c87a2c20435a224cab526 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 01:36:01 +0530 Subject: [PATCH 14/24] fix: feature flags --- router-tests/retry_test.go | 49 +++++++++++++++++++ router/core/graph_server.go | 2 +- router/internal/retrytransport/manager.go | 33 ++++++++----- .../retrytransport/retry_transport.go | 1 - 4 files changed, 71 insertions(+), 14 deletions(-) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index d52c3ef1ab..ae0186ab65 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -928,6 +928,55 @@ func TestRetryPerSubgraph(t *testing.T) { }) }) + t.Run("verify retries are applied when only all and no subgraph specific overrides are present", func(t *testing.T) { + t.Parallel() + + calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + generalMax := 5 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + 
BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: generalMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + ProductsFg: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + res, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { employees { id products } }`, + Header: map[string][]string{ + "X-Feature-Flag": {"myff"}, + }, + }) + require.NoError(t, err) + require.Equal(t, res.Response.Header.Get("X-Feature-Flag"), "myff") + require.Contains(t, res.Body, `{"message":"Failed to fetch from Subgraph 'products_fg' at Path 'employees', Reason: empty response.","extensions":{"statusCode":502}}`) + require.Equal(t, int32(generalMax+1), calls.Load()) + }) + }) + t.Run("verify retry on upgrade requests", func(t *testing.T) { t.Parallel() diff --git a/router/core/graph_server.go b/router/core/graph_server.go index 99fdc21c97..44af5e43a3 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -276,7 +276,7 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC err = retryManager.Initialize( s.retryOptions.All, s.retryOptions.SubgraphMap, - routerConfig.GetSubgraphs(), + routerConfig, ) if err != nil { return nil, err diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 1c7fdff1ad..9ca0555c3f 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -47,21 +47,30 @@ func NewManager(exprManager 
*expr.RetryExpressionManager, retryFunc ShouldRetryF } } -func (m *Manager) Initialize( - baseRetryOptions RetryOptions, - subgraphRetryOptions map[string]RetryOptions, - subgraphs []*nodev1.Subgraph, -) error { - defaultSgNames := make([]string, 0, len(subgraphs)) - customSgNames := make([]string, 0, len(subgraphs)) - - for _, subgraph := range subgraphs { - entry, ok := subgraphRetryOptions[subgraph.Name] +func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions map[string]RetryOptions, routerConfig *nodev1.RouterConfig) error { + // Get the list of all subgraph AND feature subgraphs + subgraphNameSet := make(map[string]bool, len(routerConfig.Subgraphs)) + for _, subgraph := range routerConfig.GetSubgraphs() { + subgraphNameSet[subgraph.Name] = true + } + if routerConfig.FeatureFlagConfigs != nil { + for _, ffConfig := range routerConfig.FeatureFlagConfigs.ConfigByFeatureFlagName { + for _, subgraph := range ffConfig.GetSubgraphs() { + subgraphNameSet[subgraph.Name] = true + } + } + } + + defaultSgNames := make([]string, 0, len(subgraphNameSet)) + customSgNames := make([]string, 0, len(subgraphNameSet)) + + for subgraphName := range subgraphNameSet { + entry, ok := subgraphRetryOptions[subgraphName] if !ok { - defaultSgNames = append(defaultSgNames, subgraph.Name) + defaultSgNames = append(defaultSgNames, subgraphName) } else if entry.Enabled { // This will cover the case of if a subgraph is explicitly disabled - customSgNames = append(customSgNames, subgraph.Name) + customSgNames = append(customSgNames, subgraphName) } } diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 26fcee92e3..505eca2e28 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -105,7 +105,6 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // Retry logic retries := 0 for (rt.retryManager.Retry(err, 
req, resp, retryOptions.Expression)) && retries < retryOptions.MaxRetryCount { - retries++ // Check if we should use Retry-After header for 429 responses From 3131bb710298cb40ab56dff597c61c6f4e4f16e8 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 01:54:00 +0530 Subject: [PATCH 15/24] fix: linting --- router-tests/circuit_breaker_test.go | 6 +-- router-tests/retry_test.go | 70 ++++++++++++++-------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index 2ee4ef21c0..b6d83f59b1 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -658,7 +658,7 @@ func TestCircuitBreaker(t *testing.T) { }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { + Middleware: func(_ http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { employeesCalls.Add(1) if employeesCalls.Load() <= failedTries { @@ -667,7 +667,7 @@ func TestCircuitBreaker(t *testing.T) { } upgrader := websocket.Upgrader{ - CheckOrigin: func(r *http.Request) bool { + CheckOrigin: func(_ *http.Request) bool { return true }, Subprotocols: []string{"graphql-transport-ws"}, @@ -706,7 +706,7 @@ func TestCircuitBreaker(t *testing.T) { _, message, err := testenv.WSReadMessage(t, conn) require.NoError(t, err) - require.Equal(t, defaultErrorMessage, string(message)) + require.JSONEq(t, defaultErrorMessage, string(message)) require.NoError(t, conn.Close()) switch { diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index ae0186ab65..a679a643db 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -524,7 +524,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, 
"expected initialization to fail due to invalid algorithm") }) @@ -553,7 +553,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) {}) + }, func(_ *testing.T, _ *testenv.Environment) {}) require.NoError(t, err) }) @@ -582,7 +582,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) {}) + }, func(_ *testing.T, _ *testenv.Environment) {}) require.NoError(t, err) }) @@ -611,7 +611,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, "expected initialization to fail due to invalid algorithm") }) @@ -639,7 +639,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, "expected initialization to fail due to invalid algorithm") }) @@ -669,7 +669,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, "expected initialization to fail due to invalid algorithm") }) @@ -700,7 +700,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(_ *testing.T, _ *testenv.Environment) { }) require.NoError(t, err) @@ -754,8 +754,8 @@ func TestRetryPerSubgraph(t *testing.T) { }, }, Test1: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { 
w.WriteHeader(http.StatusBadGateway) test1Calls.Add(1) }) @@ -768,9 +768,9 @@ func TestRetryPerSubgraph(t *testing.T) { Query: `query employees { employees { id } }`, }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(0), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, 0, int(test1Calls.Load())) // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -778,9 +778,9 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(test1Max+1), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, test1Max+1, int(test1Calls.Load())) }) }) @@ -832,8 +832,8 @@ func TestRetryPerSubgraph(t *testing.T) { }, }, Test1: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, 
_ *http.Request) { w.WriteHeader(http.StatusBadGateway) test1Calls.Add(1) }) @@ -846,9 +846,9 @@ func TestRetryPerSubgraph(t *testing.T) { Query: `query employees { employees { id } }`, }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(0), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, 0, int(test1Calls.Load())) // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -856,9 +856,9 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(test1Max+1), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, test1Max+1, int(test1Calls.Load())) }) }) @@ -898,8 +898,8 @@ func TestRetryPerSubgraph(t *testing.T) { }, }, Test1: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w 
http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusBadGateway) test1Calls.Add(1) }) @@ -912,9 +912,9 @@ func TestRetryPerSubgraph(t *testing.T) { Query: `query employees { employees { id } }`, }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) - require.Equal(t, int32(generalMax+1), employeesCalls.Load()) - require.Equal(t, int32(0), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, generalMax+1, int(employeesCalls.Load())) + require.Equal(t, 0, int(test1Calls.Load())) // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -922,9 +922,9 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) - require.Equal(t, int32(generalMax+1), employeesCalls.Load()) - require.Equal(t, int32(generalMax+1), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, generalMax+1, int(employeesCalls.Load())) + require.Equal(t, generalMax+1, int(test1Calls.Load())) }) }) @@ -971,9 +971,9 @@ func TestRetryPerSubgraph(t *testing.T) { }, }) require.NoError(t, err) - require.Equal(t, res.Response.Header.Get("X-Feature-Flag"), "myff") + require.Equal(t, "myff", res.Response.Header.Get("X-Feature-Flag")) require.Contains(t, res.Body, `{"message":"Failed to fetch from 
Subgraph 'products_fg' at Path 'employees', Reason: empty response.","extensions":{"statusCode":502}}`) - require.Equal(t, int32(generalMax+1), calls.Load()) + require.Equal(t, generalMax+1, int(calls.Load())) }) }) @@ -1007,7 +1007,7 @@ func TestRetryPerSubgraph(t *testing.T) { }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { + Middleware: func(_ http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { employeesCalls.Add(1) if employeesCalls.Load() <= failedTries { @@ -1016,7 +1016,7 @@ func TestRetryPerSubgraph(t *testing.T) { } upgrader := websocket.Upgrader{ - CheckOrigin: func(r *http.Request) bool { + CheckOrigin: func(_ *http.Request) bool { return true }, Subprotocols: []string{"graphql-transport-ws"}, From f3531690a8925716e825fb0d7a93ededadd14df9 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 02:08:02 +0530 Subject: [PATCH 16/24] fix: linting --- router/core/transport.go | 2 +- router/internal/expr/retry_expression.go | 6 +- router/internal/retrytransport/manager.go | 22 +++--- .../retrytransport/retry_transport_test.go | 76 +++++++++---------- router/internal/traceclient/traceclient.go | 8 +- 5 files changed, 57 insertions(+), 57 deletions(-) diff --git a/router/core/transport.go b/router/core/transport.go index 2211936ba6..fd3bc7f251 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -103,7 +103,7 @@ func NewCustomTransport( } if enableTraceClient { - getExprContext := func(ctx context.Context, req *http.Request) *expr.Context { + getExprContext := func(ctx context.Context) *expr.Context { reqContext := getRequestContext(ctx) if reqContext == nil { return &expr.Context{} diff --git a/router/internal/expr/retry_expression.go b/router/internal/expr/retry_expression.go index 0d49664dd6..c925bb87dd 100644 --- a/router/internal/expr/retry_expression.go +++ 
b/router/internal/expr/retry_expression.go @@ -22,14 +22,14 @@ func NewRetryExpressionManager() *RetryExpressionManager { } } -func (c *RetryExpressionManager) AddExpression(exprString string) error { +func (m *RetryExpressionManager) AddExpression(exprString string) error { expression := exprString if expression == "" { expression = defaultRetryExpression } // The expression has already been processed, skip recompilation - if _, ok := c.expressionMap[expression]; ok { + if _, ok := m.expressionMap[expression]; ok { return nil } @@ -45,7 +45,7 @@ func (c *RetryExpressionManager) AddExpression(exprString string) error { } // Use the normalized expression string as the key for deduplication - c.expressionMap[expression] = program + m.expressionMap[expression] = program return nil } diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 9ca0555c3f..0b140a4cca 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -78,18 +78,18 @@ func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions if len(defaultSgNames) > 0 && baseRetryOptions.Enabled { if baseRetryOptions.Algorithm != BackoffJitter { return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) - } else { - err := m.exprManager.AddExpression(baseRetryOptions.Expression) - if err != nil { - return fmt.Errorf("failed to add base retry expression: %w", err) - } else { - // Only assign default options if validation succeeds - for _, sgName := range defaultSgNames { - opts := baseRetryOptions - m.retries[sgName] = &opts - } - } } + + err := m.exprManager.AddExpression(baseRetryOptions.Expression) + if err != nil { + return fmt.Errorf("failed to add base retry expression: %w", err) + } + // Only assign default options if validation succeeds + for _, sgName := range defaultSgNames { + opts := baseRetryOptions + m.retries[sgName] = &opts + } + } // Process custom retry options 
diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index e58a1228f2..54ca3b7b02 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -7,6 +7,7 @@ import ( "io" "net/http" "net/http/httptest" + "strconv" "strings" "testing" "time" @@ -21,10 +22,10 @@ import ( const defaultMaxDuration = 100 * time.Second -var loggerFunc = func(req *http.Request) *zap.Logger { return zap.NewNop() } +var loggerFunc = func(_ *http.Request) *zap.Logger { return zap.NewNop() } -var getActiveSubgraph = func(subgraphName string) func(req *http.Request) string { - return func(req *http.Request) string { +var getActiveSubgraph = func(subgraphName string) func(_ *http.Request) string { + return func(_ *http.Request) string { return subgraphName } } @@ -62,7 +63,7 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { } func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { - mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, resp *http.Response, exprString string) bool { + mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, resp *http.Response, _ string) bool { return shouldRetry(err, req, resp) }, onRetry) // attach options for the subgraph @@ -76,12 +77,12 @@ func TestRetryOnHTTP5xx(t *testing.T) { maxRetries := 3 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := 
NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { // Return 500 to trigger retry @@ -113,12 +114,12 @@ func TestRetryOnErrors(t *testing.T) { maxRetries := 3 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { // Return any error to trigger retry @@ -158,12 +159,12 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { } subgraphName := "sg" - mgr := newTestManager(shouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ switch attemptCount { case 1: @@ -225,12 +226,12 @@ func TestShortCircuitOnSuccess(t *testing.T) { attemptCount := 0 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ 
*http.Request, _ *http.Response, _ time.Duration, _ error) { t.Error("OnRetry should not be called when first request succeeds") }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ // Always return success return &http.Response{ @@ -260,12 +261,12 @@ func TestMaxRetryCountRespected(t *testing.T) { attemptCount := 0 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ // Always return retryable error to test max retry limit return nil, errors.New("always fail") @@ -298,24 +299,24 @@ func TestResponseBodyDraining(t *testing.T) { } subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { actualRetries++ }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { index++ if index < retryCount { return &http.Response{ StatusCode: 
http.StatusInternalServerError, Body: bodies[index], }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil } + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil + }, }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) @@ -364,24 +365,23 @@ func TestRequestLoggerIsUsed(t *testing.T) { } subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { actualRetries++ }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { index++ if index < retryCount { return &http.Response{ StatusCode: http.StatusInternalServerError, Body: bodies[index], }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil } + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil }, }, func(req *http.Request) *zap.Logger { return requestLogger }, mgr, getActiveSubgraph(subgraphName)) @@ -468,7 +468,7 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { var retryAfterUsed []bool subgraphName := `sg` - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -481,7 +481,7 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { StatusCode: 
http.StatusTooManyRequests, Header: make(http.Header), } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + resp.Header.Set("Retry-After", strconv.Itoa(retryAfterSeconds)) // Verify the header is parsed correctly duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) @@ -523,7 +523,7 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { var retryAfterUsed []bool subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -536,7 +536,7 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { StatusCode: http.StatusTooManyRequests, Header: make(http.Header), } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + resp.Header.Set("Retry-After", strconv.Itoa(retryAfterSeconds)) // Verify the header is parsed correctly duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, maxDuration) @@ -573,7 +573,7 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { maxRetries := 2 subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) @@ -614,7 +614,7 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { var expectedDuration time.Duration subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp 
*http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -668,7 +668,7 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { maxRetries := 2 subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) @@ -708,7 +708,7 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { maxRetries := 2 subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) @@ -750,7 +750,7 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { var retryAfterUsedPerAttempt []bool subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -831,7 +831,7 @@ func TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { } subgraphName := `sg` - mgr := 
newTestManager(shouldNotRetry429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldNotRetry429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) diff --git a/router/internal/traceclient/traceclient.go b/router/internal/traceclient/traceclient.go index 3207e3697f..53cd7236a8 100644 --- a/router/internal/traceclient/traceclient.go +++ b/router/internal/traceclient/traceclient.go @@ -34,14 +34,14 @@ type ClientTraceContextKey struct{} type TraceInjectingRoundTripper struct { base http.RoundTripper connectionMetricStore metric.ConnectionMetricStore - getExprContext func(ctx context.Context, req *http.Request) *expr.Context + getExprContext func(ctx context.Context) *expr.Context getActiveSubgraphName func(req *http.Request) string } func NewTraceInjectingRoundTripper( base http.RoundTripper, connectionMetricStore metric.ConnectionMetricStore, - getExprContext func(ctx context.Context, req *http.Request) *expr.Context, + getExprContext func(ctx context.Context) *expr.Context, getActiveSubgraphName func(req *http.Request) string, ) *TraceInjectingRoundTripper { return &TraceInjectingRoundTripper{ @@ -106,14 +106,14 @@ func (t *TraceInjectingRoundTripper) getClientTrace(ctx context.Context) *httptr func (t *TraceInjectingRoundTripper) processConnectionMetrics(ctx context.Context, req *http.Request) { trace := GetClientTraceFromContext(ctx) - exprContext := t.getExprContext(ctx, req) + exprContext := t.getExprContext(ctx) if trace.ConnectionGet != nil && trace.ConnectionAcquired != nil { duration := trace.ConnectionAcquired.Time.Sub(trace.ConnectionGet.Time) exprContext.Subgraph.Request.ClientTrace.ConnectionAcquireDuration = duration subgraph := t.getActiveSubgraphName(req) - + serverAttributes := 
rotel.GetServerAttributes(trace.ConnectionGet.HostPort) serverAttributes = append( serverAttributes, From 72c826870ee450233cb166e82255e3ddf0672c8d Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 02:56:20 +0530 Subject: [PATCH 17/24] fix: ci tests --- router-tests/circuit_breaker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index b6d83f59b1..b7b0a8f4d1 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -724,7 +724,7 @@ func TestCircuitBreaker(t *testing.T) { require.Equal(t, failedTries, employeesCalls.Load()) // Wait for current bucket to be cleaned up - time.Sleep(breaker.RollingDuration + time.Millisecond*500) + time.Sleep(breaker.RollingDuration + time.Millisecond*2000) // ==== // Verify a success case with messages validated from here onwards From db1964c1fdaea1b24566aafe27aaa7feb5bf9328 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 03:10:19 +0530 Subject: [PATCH 18/24] fix: increase duration --- router-tests/circuit_breaker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index b7b0a8f4d1..ee5e4a4e73 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -724,7 +724,7 @@ func TestCircuitBreaker(t *testing.T) { require.Equal(t, failedTries, employeesCalls.Load()) // Wait for current bucket to be cleaned up - time.Sleep(breaker.RollingDuration + time.Millisecond*2000) + time.Sleep(breaker.RollingDuration*3 + time.Millisecond*1000) // ==== // Verify a success case with messages validated from here onwards From f37ac89baecf76e78fbc2f83a5f6ea34e9a8264c Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 18:21:13 +0530 Subject: [PATCH 19/24] fix: gofmt --- router/internal/retrytransport/manager.go | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 0b140a4cca..576358937d 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -79,7 +79,7 @@ func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions if baseRetryOptions.Algorithm != BackoffJitter { return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) } - + err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { return fmt.Errorf("failed to add base retry expression: %w", err) From a4d460baa8f8e55f4af7a7a31269ca69097ed7fb Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 19:19:10 +0530 Subject: [PATCH 20/24] fix: circuit breaker tests --- router-tests/security/circuit_breaker_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/router-tests/security/circuit_breaker_test.go b/router-tests/security/circuit_breaker_test.go index e521b6bd65..efbc59a7a2 100644 --- a/router-tests/security/circuit_breaker_test.go +++ b/router-tests/security/circuit_breaker_test.go @@ -641,8 +641,9 @@ func TestCircuitBreaker(t *testing.T) { breaker.NumBuckets = 1 breaker.RollingDuration = 5000 * time.Millisecond + breaker.ExecutionTimeout = 30 * time.Second - trafficConfig := getTrafficConfigWithTimeout(breaker, 1*time.Second) + trafficConfig := getTrafficConfigWithTimeout(breaker, 30*time.Second) employeesCalls := atomic.Int64{} From be2b7c90518b8296775d3ba4de7be1afec617e95 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 19:19:31 +0530 Subject: [PATCH 21/24] fix: tests --- router/internal/retrytransport/retry_transport.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 93746b3e2f..a2e3e69af6 100644 --- a/router/internal/retrytransport/retry_transport.go +++ 
b/router/internal/retrytransport/retry_transport.go @@ -170,6 +170,10 @@ func (rt *RetryHTTPTransport) drainBody(resp *http.Response, logger *zap.Logger) } func isResponseOK(resp *http.Response) bool { + // 101 Switching Protocols is a successful response for WebSocket upgrades + if resp.StatusCode == http.StatusSwitchingProtocols { + return true + } // Ensure we don't wait for no reason when subgraphs don't behave // spec-compliant and returns a different status code than 200. return resp.StatusCode >= 200 && resp.StatusCode < 300 From dfb90dd8acd762106f43101afe078be6c43d0fc6 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 20:12:39 +0530 Subject: [PATCH 22/24] fix: tests --- router-tests/security/circuit_breaker_test.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/router-tests/security/circuit_breaker_test.go b/router-tests/security/circuit_breaker_test.go index efbc59a7a2..c1b6d633bd 100644 --- a/router-tests/security/circuit_breaker_test.go +++ b/router-tests/security/circuit_breaker_test.go @@ -726,6 +726,11 @@ func TestCircuitBreaker(t *testing.T) { require.Equal(t, failedTries, employeesCalls.Load()) + // Ensure all previous subscriptions are fully cleaned up before + // waiting for the circuit to reset, to prevent leftover subscription + // cleanup from interfering with the half-open circuit state. 
+ xEnv.WaitForSubscriptionCount(0, time.Second*5) + // Wait for current bucket to be cleaned up time.Sleep(breaker.RollingDuration*3 + time.Millisecond*1000) @@ -733,7 +738,6 @@ func TestCircuitBreaker(t *testing.T) { // Verify a success case with messages validated from here onwards // ==== - // Sending a complete must stop the subscription conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ ID: "1", @@ -744,6 +748,12 @@ func TestCircuitBreaker(t *testing.T) { _, message, err := testenv.WSReadMessage(t, conn) require.NoError(t, err) + + t.Logf("employeesCalls after recovery: %d", employeesCalls.Load()) + t.Logf("received message: %s", string(message)) + t.Logf("circuit breaker status changed count: %d", xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) + t.Logf("circuit breaker open count: %d", xEnv.Observer().FilterMessage("Circuit breaker open, request callback did not execute").Len()) + require.JSONEq(t, timestampMessage, string(message)) err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) From 4f5265294b4988f533944a218282c9c36083b232 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 21 Apr 2026 19:39:06 +0530 Subject: [PATCH 23/24] fix: changes --- router-tests/security/circuit_breaker_test.go | 14 ++++++-------- router-tests/security/retry_test.go | 5 +++++ router-tests/subscriptions/websocket_test.go | 13 ------------- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/router-tests/security/circuit_breaker_test.go b/router-tests/security/circuit_breaker_test.go index c1b6d633bd..babbd19d70 100644 --- a/router-tests/security/circuit_breaker_test.go +++ b/router-tests/security/circuit_breaker_test.go @@ -641,9 +641,7 @@ func TestCircuitBreaker(t *testing.T) { breaker.NumBuckets = 1 breaker.RollingDuration = 5000 * time.Millisecond - breaker.ExecutionTimeout = 30 * time.Second - - trafficConfig := 
getTrafficConfigWithTimeout(breaker, 30*time.Second) + trafficConfig := getTrafficConfigWithTimeout(breaker, 1*time.Second) employeesCalls := atomic.Int64{} @@ -681,12 +679,17 @@ func TestCircuitBreaker(t *testing.T) { _ = conn.Close() }() + // Read connection_init _, _, err = testenv.WSReadMessage(t, conn) require.NoError(t, err) err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) require.NoError(t, err) + // Read subscribe message before sending data + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) require.NoError(t, err) @@ -749,11 +752,6 @@ func TestCircuitBreaker(t *testing.T) { _, message, err := testenv.WSReadMessage(t, conn) require.NoError(t, err) - t.Logf("employeesCalls after recovery: %d", employeesCalls.Load()) - t.Logf("received message: %s", string(message)) - t.Logf("circuit breaker status changed count: %d", xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) - t.Logf("circuit breaker open count: %d", xEnv.Observer().FilterMessage("Circuit breaker open, request callback did not execute").Len()) - require.JSONEq(t, timestampMessage, string(message)) err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) diff --git a/router-tests/security/retry_test.go b/router-tests/security/retry_test.go index a679a643db..2452d0d73d 100644 --- a/router-tests/security/retry_test.go +++ b/router-tests/security/retry_test.go @@ -1027,12 +1027,17 @@ func TestRetryPerSubgraph(t *testing.T) { _ = conn.Close() }() + // Read connection_init _, _, err = testenv.WSReadMessage(t, conn) require.NoError(t, err) err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) require.NoError(t, err) + // Read subscribe message before sending data + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + err = 
testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) require.NoError(t, err) diff --git a/router-tests/subscriptions/websocket_test.go b/router-tests/subscriptions/websocket_test.go index 029e544b88..981d72fe2c 100644 --- a/router-tests/subscriptions/websocket_test.go +++ b/router-tests/subscriptions/websocket_test.go @@ -741,18 +741,6 @@ func TestWebSockets(t *testing.T) { t.Run("empty allow lists should allow all headers and query args", func(t *testing.T) { t.Parallel() - opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ - All: config.GlobalSubgraphRequestRule{ - BackoffJitterRetry: config.BackoffJitterRetry{ - Enabled: true, - MaxAttempts: 5, - MaxDuration: 10 * time.Second, - Interval: 200 * time.Millisecond, - Expression: "true", - }, - }, - }) - headerRules := config.HeaderRules{ All: &config.GlobalHeaderRule{ Request: []*config.RequestHeaderRule{ @@ -781,7 +769,6 @@ func TestWebSockets(t *testing.T) { }, RouterOptions: []core.Option{ core.WithHeaderRules(headerRules), - core.WithSubgraphRetryOptions(opts), }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ From 65b1db84e08d1d72257460595b03e87bc743a5ba Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 21 Apr 2026 20:42:18 +0530 Subject: [PATCH 24/24] fix: retry updates --- router-tests/security/retry_test.go | 3 +- router/core/graph_server.go | 13 +--- router/core/retry_builder.go | 4 +- router/core/retry_builder_test.go | 45 +++++-------- router/internal/retrytransport/manager.go | 44 +++++++------ .../retrytransport/retry_transport.go | 4 +- .../retrytransport/retry_transport_test.go | 63 ++++++++++++++++++- 7 files changed, 107 insertions(+), 69 deletions(-) diff --git a/router-tests/security/retry_test.go b/router-tests/security/retry_test.go index 2452d0d73d..7939240891 100644 --- a/router-tests/security/retry_test.go +++ b/router-tests/security/retry_test.go @@ -928,12 +928,11 @@ func TestRetryPerSubgraph(t *testing.T) { }) }) - 
t.Run("verify retries are applied when only all and no subgraph specific overrides are present", func(t *testing.T) { + t.Run("verify retries are applied on feature flags when only all and no subgraph specific overrides are present", func(t *testing.T) { t.Parallel() calls := atomic.Int32{} - // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry generalMax := 5 opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ All: config.GlobalSubgraphRequestRule{ diff --git a/router/core/graph_server.go b/router/core/graph_server.go index d7634bdc34..8b313c17aa 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -298,20 +298,13 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC if s.retryOptions.IsEnabled() { retryExprManager := expr.NewRetryExpressionManager() - // Build retry options and handle any expression compilation errors - shouldRetryFunc, err := BuildRetryFunction(retryExprManager) - if err != nil { - return nil, fmt.Errorf("failed to build retry function: %w", err) - } - - retryManager := retrytransport.NewManager(retryExprManager, shouldRetryFunc, s.retryOptions.OnRetryFunc) + retryManager := retrytransport.NewManager(retryExprManager, BuildRetryFunction(retryExprManager), s.retryOptions.OnRetryFunc, s.logger) - err = retryManager.Initialize( + if err := retryManager.Initialize( s.retryOptions.All, s.retryOptions.SubgraphMap, routerConfig, - ) - if err != nil { + ); err != nil { return nil, err } diff --git a/router/core/retry_builder.go b/router/core/retry_builder.go index e563b58d3a..00ce7443ca 100644 --- a/router/core/retry_builder.go +++ b/router/core/retry_builder.go @@ -10,7 +10,7 @@ import ( ) // BuildRetryFunction creates a ShouldRetry function based on the provided expression -func BuildRetryFunction(manager *expr.RetryExpressionManager) (retrytransport.ShouldRetryFunc, error) { +func BuildRetryFunction(manager *expr.RetryExpressionManager) 
retrytransport.ShouldRetryFunc { return func(err error, req *http.Request, resp *http.Response, expression string) bool { reqContext := getRequestContext(req.Context()) if reqContext == nil { @@ -42,7 +42,7 @@ func BuildRetryFunction(manager *expr.RetryExpressionManager) (retrytransport.Sh } return shouldRetry - }, nil + } } // isDefaultRetryableError checks for errors that should always be retryable diff --git a/router/core/retry_builder_test.go b/router/core/retry_builder_test.go index 7bfeda9fcd..ecf5b4662f 100644 --- a/router/core/retry_builder_test.go +++ b/router/core/retry_builder_test.go @@ -51,8 +51,7 @@ func createRequestWithContext(opType string) (*http.Request, *requestContext) { func TestBuildRetryFunction(t *testing.T) { t.Run("build function when retry is disabled", func(t *testing.T) { manager := expr.NewRetryExpressionManager() - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) }) @@ -62,8 +61,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression("") assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -90,8 +88,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -116,8 +113,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -149,8 +145,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression("") assert.NoError(t, err) - fn, err := 
BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -179,8 +174,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -195,8 +189,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -217,8 +210,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -245,8 +237,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with mutation context @@ -275,8 +266,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with query context @@ -300,8 +290,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with subscription context @@ -323,8 +312,7 @@ func TestBuildRetryFunction(t *testing.T) { err := 
manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create request with proper context @@ -344,8 +332,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create request with proper query context @@ -366,8 +353,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with mutation context @@ -394,8 +380,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Test query operation - should retry on all conditions diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 576358937d..0271a39706 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -3,7 +3,6 @@ package retrytransport import ( "fmt" "net/http" - "sync" "time" nodev1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" @@ -32,18 +31,22 @@ type RetryOptions struct { type Manager struct { retries map[string]*RetryOptions - lock sync.RWMutex exprManager *expr.RetryExpressionManager retryFunc ShouldRetryFunc - OnRetry OnRetryFunc + onRetry OnRetryFunc + logger *zap.Logger } -func NewManager(exprManager *expr.RetryExpressionManager, retryFunc ShouldRetryFunc, onRetryFunc OnRetryFunc) *Manager { +func NewManager(exprManager *expr.RetryExpressionManager, retryFunc ShouldRetryFunc, onRetryFunc OnRetryFunc, logger 
*zap.Logger) *Manager { + if logger == nil { + logger = zap.NewNop() + } return &Manager{ retries: make(map[string]*RetryOptions), exprManager: exprManager, retryFunc: retryFunc, - OnRetry: onRetryFunc, + onRetry: onRetryFunc, + logger: logger, } } @@ -61,6 +64,17 @@ func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions } } + // Warn on retry configs pointing at subgraphs that don't exist in the + // router config — likely a typo that would otherwise silently disable + // the override. + for sgName := range subgraphRetryOptions { + if !subgraphNameSet[sgName] { + m.logger.Warn("Retry config references unknown subgraph; override will be ignored", + zap.String("subgraph_name", sgName), + ) + } + } + defaultSgNames := make([]string, 0, len(subgraphNameSet)) customSgNames := make([]string, 0, len(subgraphNameSet)) @@ -80,6 +94,8 @@ return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) } + // There is a chance that this is not evaluated if len(defaultSgNames) == 0, and it will + // only error out when it's > 0. err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { return fmt.Errorf("failed to add base retry expression: %w", err) } @@ -89,18 +105,17 @@ opts := baseRetryOptions m.retries[sgName] = &opts } - } // Process custom retry options for _, sgName := range customSgNames { entry, ok := subgraphRetryOptions[sgName] if !ok { - return fmt.Errorf("failed to add base retry expression: %s", sgName) + return fmt.Errorf("failed to get subgraphRetryOptions: %s", sgName) } if entry.Algorithm != BackoffJitter { - return fmt.Errorf("unsupported retry algorithm for subgraph %s: %s", sgName, baseRetryOptions.Algorithm) + return fmt.Errorf("unsupported retry algorithm for subgraph %s: %s", sgName, entry.Algorithm) } // Validate expression 
before assigning options @@ -121,24 +136,13 @@ func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { if m == nil { return nil } - - m.lock.RLock() - defer m.lock.RUnlock() - - if retryOptions, ok := m.retries[name]; ok { - return retryOptions - } - return nil + return m.retries[name] } func (m *Manager) IsEnabled() bool { if m == nil { return false } - - m.lock.RLock() - defer m.lock.RUnlock() - return len(m.retries) > 0 } diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index a2e3e69af6..9d9e452497 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -126,8 +126,8 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro } // A hook used for testing - if rt.retryManager.OnRetry != nil { - rt.retryManager.OnRetry(retries, req, resp, sleepDuration, err) + if rt.retryManager.onRetry != nil { + rt.retryManager.onRetry(retries, req, resp, sleepDuration, err) } // Wait for the specified duration diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index 81877be9af..41328c37ce 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -13,10 +13,13 @@ import ( "time" "go.uber.org/zap/zapcore" + "go.uber.org/zap/zaptest/observer" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap" + nodev1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" "github.com/wundergraph/cosmo/router/internal/expr" ) @@ -65,7 +68,7 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { mgr := NewManager(expr.NewRetryExpressionManager(), func(err 
error, req *http.Request, resp *http.Response, _ string) bool { return shouldRetry(err, req, resp) - }, onRetry) + }, onRetry, zap.NewNop()) // attach options for the subgraph mgr.retries[subgraphName] = &opts return mgr @@ -227,7 +230,7 @@ func TestShortCircuitOnSuccess(t *testing.T) { subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { - t.Error("OnRetry should not be called when first request succeeds") + t.Error("onRetry should not be called when first request succeeds") }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ @@ -445,7 +448,7 @@ func TestOnRetryCallbackInvoked(t *testing.T) { assert.NoError(t, err) assert.Equal(t, http.StatusOK, resp.StatusCode) - // Verify OnRetry was called the right number of times + // Verify onRetry was called the right number of times assert.Equal(t, maxRetries, retries) assert.Len(t, retryCallbacks, maxRetries) @@ -1015,3 +1018,57 @@ func TestShouldUseRetryAfter(t *testing.T) { }) } } + +func TestManager_Initialize_WarnsOnUnknownSubgraph(t *testing.T) { + routerConfig := &nodev1.RouterConfig{ + Subgraphs: []*nodev1.Subgraph{ + {Id: "known-id", Name: "known", RoutingUrl: "http://localhost:8001/graphql"}, + }, + } + + t.Run("warns for each retry config whose subgraph is not in the router config", func(t *testing.T) { + core, logs := observer.New(zapcore.WarnLevel) + mgr := NewManager(expr.NewRetryExpressionManager(), nil, nil, zap.New(core)) + + err := mgr.Initialize( + RetryOptions{}, + map[string]RetryOptions{ + "known": {Enabled: true, Algorithm: BackoffJitter, Expression: "true"}, + "typo_name": {Enabled: true, Algorithm: BackoffJitter, Expression: "true"}, + "other_typo": {Enabled: false}, + }, + routerConfig, + ) + require.NoError(t, err) + + warnings := logs.FilterMessage("Retry config references unknown subgraph; override will 
be ignored").All() + require.Len(t, warnings, 2) + + got := make(map[string]bool, len(warnings)) + for _, entry := range warnings { + got[entry.ContextMap()["subgraph_name"].(string)] = true + } + require.True(t, got["typo_name"]) + require.True(t, got["other_typo"]) + + // Unknown-named overrides must not leak into the retries map + require.Nil(t, mgr.GetSubgraphOptions("typo_name")) + require.Nil(t, mgr.GetSubgraphOptions("other_typo")) + require.NotNil(t, mgr.GetSubgraphOptions("known")) + }) + + t.Run("does not warn when every retry config matches a known subgraph", func(t *testing.T) { + core, logs := observer.New(zapcore.WarnLevel) + mgr := NewManager(expr.NewRetryExpressionManager(), nil, nil, zap.New(core)) + + err := mgr.Initialize( + RetryOptions{}, + map[string]RetryOptions{ + "known": {Enabled: true, Algorithm: BackoffJitter, Expression: "true"}, + }, + routerConfig, + ) + require.NoError(t, err) + require.Zero(t, logs.FilterMessage("Retry config references unknown subgraph; override will be ignored").Len()) + }) +}