From d86e5cced7f5f56f4e65fca95d4709f8905a03f2 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 18:23:41 +0530 Subject: [PATCH 01/24] feat: retry settings config per subgraph --- router-tests/error_handling_test.go | 24 +- router-tests/panic_test.go | 12 +- router-tests/retry_test.go | 343 +++++++++++++- router-tests/structured_logging_test.go | 10 +- router-tests/telemetry/telemetry_test.go | 28 +- router/core/graph_server.go | 32 +- router/core/retry_builder.go | 71 +-- router/core/retry_builder_test.go | 343 +++++++------- router/core/router.go | 73 ++- router/core/router_config.go | 5 +- router/core/supervisor_instance.go | 10 +- router/core/transport.go | 14 +- router/internal/expr/retry_expression.go | 48 +- router/internal/expr/retry_expression_test.go | 32 +- router/internal/retrytransport/manager.go | 144 ++++++ .../retrytransport/retry_transport.go | 60 ++- .../retrytransport/retry_transport_test.go | 444 ++++++++---------- 17 files changed, 1046 insertions(+), 647 deletions(-) create mode 100644 router/internal/retrytransport/manager.go diff --git a/router-tests/error_handling_test.go b/router-tests/error_handling_test.go index 394b712492..5a7f69febe 100644 --- a/router-tests/error_handling_test.go +++ b/router-tests/error_handling_test.go @@ -1380,7 +1380,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -1412,7 +1416,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + 
core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -1444,7 +1452,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -1476,7 +1488,11 @@ func TestErrorPropagation(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { resp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ diff --git a/router-tests/panic_test.go b/router-tests/panic_test.go index bb12614f44..eeca055597 100644 --- a/router-tests/panic_test.go +++ b/router-tests/panic_test.go @@ -48,7 +48,11 @@ func TestEnginePanic(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { res, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ 
@@ -80,7 +84,11 @@ func TestEnginePanic(t *testing.T) { EnableSingleFlight: true, ParseKitPoolSize: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, }, func(t *testing.T, xEnv *testenv.Environment) { res, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index cf916e15bc..010b3ab1d8 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -1,15 +1,16 @@ package integration import ( - "github.com/stretchr/testify/require" - "github.com/wundergraph/cosmo/router-tests/testenv" - "github.com/wundergraph/cosmo/router/core" - "github.com/wundergraph/cosmo/router/pkg/config" "net/http" "strconv" "sync/atomic" "testing" "time" + + "github.com/stretchr/testify/require" + "github.com/wundergraph/cosmo/router-tests/testenv" + "github.com/wundergraph/cosmo/router/core" + "github.com/wundergraph/cosmo/router/pkg/config" ) func CreateRetryCounterFunc(counter *atomic.Int32, duration *atomic.Int64) func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { @@ -34,7 +35,19 @@ func TestRetry(t *testing.T) { maxRetryCount := 3 expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ 
-76,7 +89,19 @@ func TestRetry(t *testing.T) { maxRetryCount := 3 expression := "false" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -117,7 +142,19 @@ func TestRetry(t *testing.T) { maxRetryCount := 3 expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -159,7 +196,19 @@ func TestRetry(t *testing.T) { maxAttemptsBeforeServiceSucceeds := 2 expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 10*time.Second, 200*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := 
core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -208,7 +257,19 @@ func TestRetry(t *testing.T) { expression := "statusCode == 429" headerRetryIntervalInSeconds := 1 - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 2000*time.Second, 100*time.Millisecond, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 2000 * time.Second, + Interval: 100 * time.Millisecond, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -260,7 +321,19 @@ func TestFlakyRetry(t *testing.T) { maxDuration := 100 * time.Millisecond expression := "true" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, maxDuration, retryInterval, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: maxDuration, + Interval: retryInterval, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -315,7 +388,19 @@ func TestFlakyRetry(t *testing.T) { maxRetryCount := 3 expression := "statusCode == 429" - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 1000*time.Millisecond, retryInterval, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 1000 * time.Millisecond, + Interval: 
retryInterval, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -363,7 +448,19 @@ func TestFlakyRetry(t *testing.T) { emptyRetryInterval := 0 retryInterval := 300 * time.Millisecond - options := core.WithSubgraphRetryOptions(true, "", maxRetryCount, 1000*time.Millisecond, retryInterval, expression, retryCounterFunc) + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: maxRetryCount, + MaxDuration: 1000 * time.Millisecond, + Interval: retryInterval, + Expression: expression, + }, + }, + }) + opts.OnRetryFunc = retryCounterFunc + options := core.WithSubgraphRetryOptions(opts) testenv.Run(t, &testenv.Config{ NoRetryClient: true, @@ -399,3 +496,225 @@ func TestFlakyRetry(t *testing.T) { }) }) } + +func TestRetryPerSubgraph(t *testing.T) { + t.Parallel() + + t.Run("verify invalid algorithm is detected for base", func(t *testing.T) { + t.Parallel() + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "unsupported retry algorithm") + }) + + t.Run("verify invalid algorithm is detected for per subgraphs", func(t 
*testing.T) { + t.Parallel() + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "unsupported retry algorithm") + }) + + t.Run("verify retries are applied per subgraph", func(t *testing.T) { + t.Parallel() + + employeesCalls := atomic.Int32{} + test1Calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + employeesMax := 3 + test1Max := 1 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: employeesMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + "test1": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: test1Max, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w 
http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + employeesCalls.Add(1) + }) + }, + }, + Test1: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + test1Calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // 1) Call employees-only query; expect employees subgraph to be retried employeesMax times (attempts = retries + 1) + resEmp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query employees { employees { id } }`, + }) + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, int32(0), test1Calls.Load()) + + // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) + resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { floatField(arg: 1.5) }`, + }) + + require.NoError(t, err) + require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, int32(test1Max+1), test1Calls.Load()) + }) + }) + + t.Run("verify retries are applied per subgraph when mixed configurations", func(t *testing.T) { + t.Parallel() + + employeesCalls := atomic.Int32{} + test1Calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + employeesMax := 3 + test1Max := 1 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: employeesMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + 
Expression: "true", + }, + }, + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "test1": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: test1Max, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + employeesCalls.Add(1) + }) + }, + }, + Test1: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + test1Calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // 1) Call employees-only query; expect employees subgraph to be retried employeesMax times (attempts = retries + 1) + resEmp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query employees { employees { id } }`, + }) + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, int32(0), test1Calls.Load()) + + // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) + resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { floatField(arg: 1.5) }`, + }) + + require.NoError(t, err) + require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) + require.Equal(t, 
int32(test1Max+1), test1Calls.Load()) + }) + }) +} diff --git a/router-tests/structured_logging_test.go b/router-tests/structured_logging_test.go index 7464fa26c9..b452c03c13 100644 --- a/router-tests/structured_logging_test.go +++ b/router-tests/structured_logging_test.go @@ -709,7 +709,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, @@ -828,7 +828,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, @@ -960,7 +960,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, @@ -1096,7 +1096,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: 
testenv.LogObservationConfig{ Enabled: true, @@ -2211,7 +2211,7 @@ func TestFlakyAccessLogs(t *testing.T) { EnableSingleFlight: true, MaxConcurrentResolvers: 1, }), - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{All: config.GlobalSubgraphRequestRule{BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}}})), }, LogObservation: testenv.LogObservationConfig{ Enabled: true, diff --git a/router-tests/telemetry/telemetry_test.go b/router-tests/telemetry/telemetry_test.go index 2b4987c0b7..205128a295 100644 --- a/router-tests/telemetry/telemetry_test.go +++ b/router-tests/telemetry/telemetry_test.go @@ -826,7 +826,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate operation cache telemetry for persisted and non persisted operations", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1204,7 +1203,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate operation cache telemetry when prometheus is also enabled", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1546,7 +1544,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate key and cost eviction metrics with small validation cache size without feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1890,7 +1887,6 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { 
metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run("Validate operation cache telemetry for default configuration including feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -3562,7 +3558,6 @@ func TestFlakyTelemetry(t *testing.T) { }, }, } - want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -4087,7 +4082,6 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, sn[1].Attributes(), otel.WgEnginePersistedOperationCacheHit.Bool(true)) }) }) - t.Run("Should exclude high cardinality attributes only from metrics if custom exporter is defined", func(t *testing.T) { t.Parallel() @@ -4735,7 +4729,6 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Len(t, rm.ScopeMetrics, defaultExposedScopedMetricsCount) - scopeMetric := *integration.GetMetricScopeByName(rm.ScopeMetrics, "cosmo.router") require.Len(t, scopeMetric.Metrics, defaultCosmoRouterMetricsCount) @@ -5106,7 +5099,6 @@ func TestFlakyTelemetry(t *testing.T) { }) }) - t.Run("Should remap metric name to configured value", func(t *testing.T) { t.Parallel() @@ -5447,7 +5439,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("Custom span and resource attributes are attached to all metrics and spans / from header", func(t *testing.T) { t.Parallel() @@ -5879,7 +5870,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("Custom span and resource attributes are attached to all metrics and spans / static", func(t *testing.T) { t.Parallel() @@ -6309,7 
+6299,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("Requesting a feature flags will emit different router config version and add the feature flag attribute", func(t *testing.T) { t.Parallel() @@ -6950,7 +6939,6 @@ func TestFlakyTelemetry(t *testing.T) { require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[8].Status()) }) }) - t.Run("Origin connectivity issue is traced", func(t *testing.T) { t.Parallel() @@ -7532,7 +7520,6 @@ func TestFlakyTelemetry(t *testing.T) { integration.AssertAttributeNotInSet(t, resClFiltered.DataPoints[1].Attributes, otel.WgOperationName.String("")) }) }) - t.Run("Custom Metric Attributes", func(t *testing.T) { t.Parallel() @@ -8104,7 +8091,6 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, want, scopeMetric, metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) - t.Run("should not remap high cardinality fields when using cloud exporter but include custom metric attributes", func(t *testing.T) { t.Parallel() @@ -8706,7 +8692,11 @@ func TestFlakyTelemetry(t *testing.T) { }, }, RouterOptions: []core.Option{ - core.WithSubgraphRetryOptions(false, "", 0, 0, 0, "", nil), + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{Enabled: false}, + }, + })), }, Subgraphs: testenv.SubgraphsConfig{ Products: testenv.SubgraphConfig{ @@ -8742,7 +8732,6 @@ func TestFlakyTelemetry(t *testing.T) { require.True(t, found, "expected to find a datapoint with subgraph name and id in the metrics") }) }) - t.Run("Tracing is not affected by custom metric attributes", func(t *testing.T) { t.Parallel() @@ -9187,7 +9176,6 @@ func TestFlakyTelemetry(t *testing.T) { assert.ErrorAs(t, err, &expectedErr) }) }) - t.Run("custom trace metrics with expression", 
func(t *testing.T) { t.Parallel() @@ -9592,7 +9580,6 @@ func TestFlakyTelemetry(t *testing.T) { }) }) }) - t.Run("verify attribute expressions with subgraph in the expression", func(t *testing.T) { t.Run("verify subgraph expression should only be present for engine fetch", func(t *testing.T) { t.Parallel() @@ -9650,8 +9637,8 @@ func TestFlakyTelemetry(t *testing.T) { require.IsType(t, metricdata.Sum[int64]{}, httpRequestsMetric.Data) data2 := httpRequestsMetric.Data.(metricdata.Sum[int64]) - attrs := data2.DataPoints[0].Attributes - _, ok := attrs.Value(attribute.Key(key)) + atts := data2.DataPoints[0].Attributes + _, ok := atts.Value(attribute.Key(key)) require.False(t, ok) }) }) @@ -10204,7 +10191,6 @@ func TestExcludeAttributesWithCustomExporter(t *testing.T) { metricdatatest.AssertEqual(t, connectionMetrics, *integration.GetMetricByName(engineScope, "router.engine.connections"), metricdatatest.IgnoreTimestamp()) }) }) - t.Run(fmt.Sprintf("cache metrics for %s", usingCustomExporter), func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() diff --git a/router/core/graph_server.go b/router/core/graph_server.go index 875b0969d5..6079066580 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -40,6 +40,7 @@ import ( rmiddleware "github.com/wundergraph/cosmo/router/internal/middleware" "github.com/wundergraph/cosmo/router/internal/recoveryhandler" "github.com/wundergraph/cosmo/router/internal/requestlogger" + "github.com/wundergraph/cosmo/router/internal/retrytransport" "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/cors" "github.com/wundergraph/cosmo/router/pkg/execution_config" @@ -100,6 +101,7 @@ type ( traceDialer *TraceDialer connector *grpcconnector.Connector circuitBreakerManager *circuit.Manager + retryManager *retrytransport.Manager } ) @@ -261,6 +263,28 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC s.circuitBreakerManager = manager } 
+ if s.retryOptions.IsEnabled() { + retryExprManager := expr.NewRetryExpressionManager() + // Build retry options and handle any expression compilation errors + shouldRetryFunc, err := BuildRetryFunction(retryExprManager) + if err != nil { + return nil, fmt.Errorf("failed to build retry function: %w", err) + } + + retryManager := retrytransport.NewManager(retryExprManager, shouldRetryFunc, s.retryOptions.OnRetryFunc) + + err = retryManager.Initialize( + s.retryOptions.All, + s.retryOptions.SubgraphMap, + routerConfig.GetSubgraphs(), + ) + if err != nil { + return nil, err + } + + s.retryManager = retryManager + } + routingUrlGroupings, err := getRoutingUrlGroupingForCircuitBreakers(routerConfig, s.overrideRoutingURLConfiguration, s.overrides) if err != nil { return nil, err @@ -1155,12 +1179,6 @@ func (s *graphServer) buildGraphMux( baseConnMetricStore = s.connectionMetrics } - // Build retry options and handle any expression compilation errors - processedRetryOptions, err := ProcessRetryOptions(s.retryOptions) - if err != nil { - return nil, fmt.Errorf("failed to process retry options: %w", err) - } - ecb := &ExecutorConfigurationBuilder{ introspection: s.introspection, baseURL: s.baseURL, @@ -1181,13 +1199,13 @@ func (s *graphServer) buildGraphMux( PostHandlers: s.postOriginHandlers, MetricStore: gm.metricStore, ConnectionMetricStore: baseConnMetricStore, - RetryOptions: *processedRetryOptions, TracerProvider: s.tracerProvider, TracePropagators: s.compositePropagator, LocalhostFallbackInsideDocker: s.localhostFallbackInsideDocker, Logger: s.logger, EnableTraceClient: enableTraceClient, CircuitBreaker: s.circuitBreakerManager, + RetryManager: s.retryManager, }, } diff --git a/router/core/retry_builder.go b/router/core/retry_builder.go index 30fffcb53f..e563b58d3a 100644 --- a/router/core/retry_builder.go +++ b/router/core/retry_builder.go @@ -1,7 +1,6 @@ package core import ( - "fmt" "net/http" "strings" @@ -10,74 +9,10 @@ import ( "go.uber.org/zap" ) -const ( - 
defaultRetryExpression = "IsRetryableStatusCode() || IsConnectionError() || IsTimeout()" - - backoffJitter = "backoff_jitter" -) - -var noopRetryFunc = func(err error, req *http.Request, resp *http.Response) bool { - return false -} - -func ProcessRetryOptions(retryOpts retrytransport.RetryOptions) (*retrytransport.RetryOptions, error) { - // Default to backOffJitter if no algorithm is specified - // This will occur either in tests or if the user explicitly makes it an empty string - if retryOpts.Algorithm == "" { - retryOpts.Algorithm = backoffJitter - } - - // We skip validating the algorithm if retries are disabled - if retryOpts.Enabled && retryOpts.Algorithm != backoffJitter { - return nil, fmt.Errorf("unsupported retry algorithm: %s", retryOpts.Algorithm) - } - - shouldRetryFunc, err := buildRetryFunction(retryOpts) - if err != nil { - return nil, fmt.Errorf("failed to build retry function: %w", err) - } - - // Create copy to not mutate the original reference - retryOptions := retrytransport.RetryOptions{ - Enabled: retryOpts.Enabled, - Algorithm: retryOpts.Algorithm, - MaxRetryCount: retryOpts.MaxRetryCount, - MaxDuration: retryOpts.MaxDuration, - Interval: retryOpts.Interval, - Expression: retryOpts.Expression, - - OnRetry: retryOpts.OnRetry, - - ShouldRetry: shouldRetryFunc, - } - - return &retryOptions, nil -} - // BuildRetryFunction creates a ShouldRetry function based on the provided expression -func buildRetryFunction(retryOpts retrytransport.RetryOptions) (retrytransport.ShouldRetryFunc, error) { - // We do not need to build a retry function if retries are disabled - // This means that any bad expressions are ignored if retries are disabled - if !retryOpts.Enabled { - return noopRetryFunc, nil - } - - // Use default expression if empty string is passed - expression := retryOpts.Expression - if expression == "" { - expression = defaultRetryExpression - } - - // Create the retry expression manager - manager, err := 
expr.NewRetryExpressionManager(expression) - if err != nil { - return nil, fmt.Errorf("failed to create expression manager: %w", err) - } - - // Return expression-based retry function - return func(err error, req *http.Request, resp *http.Response) bool { +func BuildRetryFunction(manager *expr.RetryExpressionManager) (retrytransport.ShouldRetryFunc, error) { + return func(err error, req *http.Request, resp *http.Response, expression string) bool { reqContext := getRequestContext(req.Context()) - if reqContext == nil { return false } @@ -95,7 +30,7 @@ func buildRetryFunction(retryOpts retrytransport.RetryOptions) (retrytransport.S ctx := expr.LoadRetryContext(err, resp) // Evaluate the expression - shouldRetry, evalErr := manager.ShouldRetry(ctx) + shouldRetry, evalErr := manager.ShouldRetry(ctx, expression) if evalErr != nil { reqContext.logger.Error("Failed to evaluate retry expression", zap.Error(evalErr), diff --git a/router/core/retry_builder_test.go b/router/core/retry_builder_test.go index d4fa1cf3f6..6a2d34cdab 100644 --- a/router/core/retry_builder_test.go +++ b/router/core/retry_builder_test.go @@ -2,15 +2,13 @@ package core import ( "errors" - "fmt" - "github.com/wundergraph/cosmo/router/internal/retrytransport" "io" "net/http" - "reflect" "syscall" "testing" "github.com/stretchr/testify/assert" + "github.com/wundergraph/cosmo/router/internal/expr" "go.uber.org/zap" ) @@ -52,23 +50,19 @@ func createRequestWithContext(opType string) (*http.Request, *requestContext) { func TestBuildRetryFunction(t *testing.T) { t.Run("build function when retry is disabled", func(t *testing.T) { - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: false, - Expression: "invalid expression ++++++", - }) + manager := expr.NewRetryExpressionManager() + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) - assert.Equal(t, - reflect.ValueOf(noopRetryFunc).Pointer(), - reflect.ValueOf(fn).Pointer(), - ) + assert.NotNil(t, fn) }) t.Run("default 
expression behavior", func(t *testing.T) { // Use the default expression that would be in the config - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: defaultRetryExpression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression("") + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -77,25 +71,26 @@ func TestBuildRetryFunction(t *testing.T) { // Test default behavior - should retry on 500 resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, "")) // Should not retry on 200 resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, "")) // Test with errors - only expression-defined errors are handled here - assert.True(t, fn(syscall.ETIMEDOUT, req, nil)) - assert.True(t, fn(errors.New("connection refused"), req, nil)) - assert.True(t, fn(errors.New("unexpected EOF"), req, nil)) // EOF is now handled at transport layer, not expression - assert.False(t, fn(errors.New("some other error"), req, nil)) + assert.True(t, fn(syscall.ETIMEDOUT, req, nil, "")) + assert.True(t, fn(errors.New("connection refused"), req, nil, "")) + assert.True(t, fn(errors.New("unexpected EOF"), req, nil, "")) // EOF is now handled at transport layer, not expression + assert.False(t, fn(errors.New("some other error"), req, nil, "")) }) t.Run("expression-based retry", func(t *testing.T) { expression := "statusCode == 500 || statusCode == 503" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -104,23 +99,24 @@ func TestBuildRetryFunction(t *testing.T) { // Should retry on 500 resp := &http.Response{StatusCode: 
500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Should retry on 503 resp.StatusCode = 503 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Should not retry on 502 resp.StatusCode = 502 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("expression with error conditions", func(t *testing.T) { expression := "IsTimeout() || statusCode == 503" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -129,33 +125,31 @@ func TestBuildRetryFunction(t *testing.T) { // Should retry on timeout error err = syscall.ETIMEDOUT - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, expression)) // Should retry on 503 resp := &http.Response{StatusCode: 503} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Should not retry on other errors err = errors.New("some other error") - assert.False(t, fn(err, req, nil)) + assert.False(t, fn(err, req, nil, expression)) }) t.Run("invalid expression returns error", func(t *testing.T) { expression := "invalid syntax +++" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) assert.Error(t, err) - assert.Nil(t, fn) assert.Contains(t, err.Error(), "failed to compile retry expression") }) t.Run("empty expression uses default", func(t *testing.T) { - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: "", - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression("") + assert.NoError(t, err) + + fn, err := 
BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -164,42 +158,44 @@ func TestBuildRetryFunction(t *testing.T) { // Test with retryable status code resp := &http.Response{StatusCode: 502} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, "")) // Test with connection error err = errors.New("connection refused") - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, "")) // Test with timeout error err = syscall.ETIMEDOUT - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, "")) // Test with non-retryable error err = errors.New("some other error") - assert.False(t, fn(err, req, nil)) + assert.False(t, fn(err, req, nil, "")) }) t.Run("expression that always returns false but the error is an eof error", func(t *testing.T) { expression := "false" // Don't retry - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) // Create a request with proper query context req, _ := createRequestWithContext(OperationTypeQuery) - assert.True(t, fn(io.ErrUnexpectedEOF, req, nil)) + assert.True(t, fn(io.ErrUnexpectedEOF, req, nil, expression)) }) t.Run("expression that always returns true", func(t *testing.T) { expression := "true" // Always retry - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -208,19 +204,20 @@ func TestBuildRetryFunction(t *testing.T) { resp := &http.Response{StatusCode: 500} // Should retry when expression is true - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, 
req, resp, expression)) // Even for status codes that wouldn't normally retry resp.StatusCode = 200 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) }) t.Run("complex expression", func(t *testing.T) { expression := "(statusCode >= 500 && statusCode < 600) || IsConnectionError()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -229,25 +226,26 @@ func TestBuildRetryFunction(t *testing.T) { // Test 5xx errors resp := &http.Response{StatusCode: 503} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test connection error err = errors.New("connection refused") - assert.True(t, fn(err, req, nil)) + assert.True(t, fn(err, req, nil, expression)) // Test non-matching conditions resp.StatusCode = 404 err = errors.New("some other error") - assert.False(t, fn(err, req, resp)) + assert.False(t, fn(err, req, resp, expression)) }) t.Run("mutation never retries with proper context", func(t *testing.T) { // Use expression that would normally retry on 500 errors expression := "statusCode >= 500 || IsTimeout() || IsConnectionError()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -256,29 +254,28 @@ func TestBuildRetryFunction(t *testing.T) { // Test with 500 status - should NOT retry because it's a mutation resp := &http.Response{StatusCode: 500} - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) // Test with timeout error - should NOT retry because 
it's a mutation - assert.False(t, fn(syscall.ETIMEDOUT, req, nil)) + assert.False(t, fn(syscall.ETIMEDOUT, req, nil, expression)) // Test with connection error - should NOT retry because it's a mutation - assert.False(t, fn(errors.New("connection refused"), req, nil)) + assert.False(t, fn(errors.New("connection refused"), req, nil, expression)) // Test with expression that always returns true - should still NOT retry - alwaysRetryFn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: "true", - }) + alwaysRetryExpression := "true" + err = manager.AddExpression(alwaysRetryExpression) assert.NoError(t, err) - assert.False(t, alwaysRetryFn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, alwaysRetryExpression)) }) t.Run("query retries with proper context", func(t *testing.T) { expression := "statusCode >= 500 || IsTimeout()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -287,22 +284,23 @@ func TestBuildRetryFunction(t *testing.T) { // Test with 500 status - should retry because it's a query resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test with timeout error - should retry because it's a query - assert.True(t, fn(syscall.ETIMEDOUT, req, nil)) + assert.True(t, fn(syscall.ETIMEDOUT, req, nil, expression)) // Test with 200 status - should not retry even for query resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("subscription retries with proper context", func(t *testing.T) { expression := "statusCode >= 500" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + 
manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -311,20 +309,21 @@ func TestBuildRetryFunction(t *testing.T) { // Test with 500 status - should retry because it's a subscription (not mutation) resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test with 200 status - should not retry resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("error logging with proper context", func(t *testing.T) { // Test that error logging works with proper request context expression := "statusCode >= 500" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -333,18 +332,19 @@ func TestBuildRetryFunction(t *testing.T) { // Test that it works normally with proper context resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("request context with query operation", func(t *testing.T) { expression := "statusCode >= 500" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -353,19 +353,20 @@ func TestBuildRetryFunction(t *testing.T) { // Should work with proper request context - expression should be 
evaluated normally resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) }) t.Run("complex expression with mutation context", func(t *testing.T) { // Complex expression that would normally retry in many cases expression := "(statusCode >= 500 && statusCode < 600) || IsConnectionError() || IsTimeout() || statusCode == 429" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -374,25 +375,26 @@ func TestBuildRetryFunction(t *testing.T) { // Test various conditions that would normally trigger retry resp := &http.Response{StatusCode: 500} - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) resp.StatusCode = 503 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) resp.StatusCode = 429 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) - assert.False(t, fn(syscall.ETIMEDOUT, req, nil)) - assert.False(t, fn(errors.New("connection refused"), req, nil)) + assert.False(t, fn(syscall.ETIMEDOUT, req, nil, expression)) + assert.False(t, fn(errors.New("connection refused"), req, nil, expression)) }) t.Run("new operation with comprehensive retry conditions", func(t *testing.T) { // Create a new comprehensive operation to test all retry scenarios expression := "statusCode >= 500 || statusCode == 429 || IsTimeout() || IsConnectionError()" - fn, err := buildRetryFunction(retrytransport.RetryOptions{ - Enabled: true, - Expression: expression, - }) + manager := expr.NewRetryExpressionManager() + err := manager.AddExpression(expression) + 
assert.NoError(t, err) + + fn, err := BuildRetryFunction(manager) assert.NoError(t, err) assert.NotNil(t, fn) @@ -401,81 +403,82 @@ func TestBuildRetryFunction(t *testing.T) { // Test 5xx errors resp := &http.Response{StatusCode: 500} - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) resp.StatusCode = 503 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test rate limiting resp.StatusCode = 429 - assert.True(t, fn(nil, req, resp)) + assert.True(t, fn(nil, req, resp, expression)) // Test timeouts - assert.True(t, fn(syscall.ETIMEDOUT, req, nil)) + assert.True(t, fn(syscall.ETIMEDOUT, req, nil, expression)) // Test connection errors - assert.True(t, fn(errors.New("connection refused"), req, nil)) + assert.True(t, fn(errors.New("connection refused"), req, nil, expression)) // Test success - should not retry resp.StatusCode = 200 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) // Test client errors - should not retry resp.StatusCode = 404 - assert.False(t, fn(nil, req, resp)) + assert.False(t, fn(nil, req, resp, expression)) // Now test the same conditions with a mutation - should never retry mutationReq, _ := createRequestWithContext(OperationTypeMutation) resp.StatusCode = 500 - assert.False(t, fn(nil, mutationReq, resp)) + assert.False(t, fn(nil, mutationReq, resp, expression)) resp.StatusCode = 503 - assert.False(t, fn(nil, mutationReq, resp)) + assert.False(t, fn(nil, mutationReq, resp, expression)) resp.StatusCode = 429 - assert.False(t, fn(nil, mutationReq, resp)) - assert.False(t, fn(syscall.ETIMEDOUT, mutationReq, nil)) - assert.False(t, fn(errors.New("connection refused"), mutationReq, nil)) + assert.False(t, fn(nil, mutationReq, resp, expression)) + assert.False(t, fn(syscall.ETIMEDOUT, mutationReq, nil, expression)) + assert.False(t, fn(errors.New("connection refused"), mutationReq, nil, expression)) }) } -func TestProcessRetryOptions(t 
*testing.T) { - t.Run("process invalid algorithm", func(t *testing.T) { - algorithm := "abcdee" - _, err := ProcessRetryOptions(retrytransport.RetryOptions{ - Enabled: true, - Algorithm: algorithm, - }) - - expectedError := fmt.Sprintf("unsupported retry algorithm: %s", algorithm) - assert.ErrorContains(t, err, expectedError) - }) - - t.Run("process invalid algorithm when retries are disabled", func(t *testing.T) { - algorithm := "abcdee" - _, err := ProcessRetryOptions(retrytransport.RetryOptions{ - Enabled: false, - Algorithm: algorithm, - }) - assert.NoError(t, err) - }) - - t.Run("process invalid expression", func(t *testing.T) { - _, err := ProcessRetryOptions(retrytransport.RetryOptions{ - Enabled: true, - Algorithm: "backoff_jitter", - Expression: "invalid syntax +++", - }) - - assert.ErrorContains(t, err, "failed to build retry function") - }) - - t.Run("process valid options", func(t *testing.T) { - options := retrytransport.RetryOptions{ - Enabled: true, - Algorithm: "backoff_jitter", - Expression: "statusCode == 500 || IsTimeout() || IsConnectionError()", - } - response, err := ProcessRetryOptions(options) - assert.NoError(t, err) - assert.NotSame(t, &options, response) - }) -} +// TODO: Fix +//func TestProcessRetryOptions(t *testing.T) { +// t.Run("process invalid algorithm", func(t *testing.T) { +// algorithm := "abcdee" +// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ +// Enabled: true, +// Algorithm: algorithm, +// }) +// +// expectedError := fmt.Sprintf("unsupported retry algorithm: %s", algorithm) +// assert.ErrorContains(t, err, expectedError) +// }) +// +// t.Run("process invalid algorithm when retries are disabled", func(t *testing.T) { +// algorithm := "abcdee" +// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ +// Enabled: false, +// Algorithm: algorithm, +// }) +// assert.NoError(t, err) +// }) +// +// t.Run("process invalid expression", func(t *testing.T) { +// _, err := 
ProcessRetryOptions(retrytransport.RetryOptions{ +// Enabled: true, +// Algorithm: "backoff_jitter", +// Expression: "invalid syntax +++", +// }) +// +// assert.ErrorContains(t, err, "failed to build retry function") +// }) +// +// t.Run("process valid options", func(t *testing.T) { +// options := retrytransport.RetryOptions{ +// Enabled: true, +// Algorithm: "backoff_jitter", +// Expression: "statusCode == 500 || IsTimeout() || IsConnectionError()", +// } +// response, err := ProcessRetryOptions(options) +// assert.NoError(t, err) +// assert.NotSame(t, &options, response) +// }) +//} diff --git a/router/core/router.go b/router/core/router.go index f5bed12df7..afccb42fbe 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -184,6 +184,27 @@ func (r *SubgraphCircuitBreakerOptions) IsEnabled() bool { return r.CircuitBreaker.Enabled || len(r.SubgraphMap) > 0 } +type SubgraphRetryOptions struct { + All retrytransport.RetryOptions + SubgraphMap map[string]retrytransport.RetryOptions + OnRetryFunc retrytransport.OnRetryFunc +} + +func (r *SubgraphRetryOptions) IsEnabled() bool { + if r == nil { + return false + } + if r.All.Enabled { + return true + } + for _, cfg := range r.SubgraphMap { + if cfg.Enabled { + return true + } + } + return false +} + // NewRouter creates a new Router instance. Router.Start() must be called to start the server. // Alternatively, use Router.NewServer() to create a new server instance without starting it. 
func NewRouter(opts ...Option) (*Router, error) { @@ -1752,26 +1773,9 @@ func WithSubgraphCircuitBreakerOptions(opts *SubgraphCircuitBreakerOptions) Opti } } -func WithSubgraphRetryOptions( - enabled bool, - algorithm string, - maxRetryCount int, - retryMaxDuration, retryInterval time.Duration, - expression string, - onRetryFunc retrytransport.OnRetryFunc, -) Option { +func WithSubgraphRetryOptions(opts *SubgraphRetryOptions) Option { return func(r *Router) { - r.retryOptions = retrytransport.RetryOptions{ - Enabled: enabled, - Algorithm: algorithm, - MaxRetryCount: maxRetryCount, - MaxDuration: retryMaxDuration, - Interval: retryInterval, - Expression: expression, - - // Test case overrides - OnRetry: onRetryFunc, - } + r.retryOptions = opts } } @@ -1901,6 +1905,37 @@ func NewSubgraphCircuitBreakerOptions(cfg config.TrafficShapingRules) *SubgraphC return entry } +func NewSubgraphRetryOptions(cfg config.TrafficShapingRules) *SubgraphRetryOptions { + entry := &SubgraphRetryOptions{ + SubgraphMap: map[string]retrytransport.RetryOptions{}, + } + // If we have a global default + if cfg.All.BackoffJitterRetry.Enabled { + entry.All = newRetryConfig(cfg.All.BackoffJitterRetry) + } + // Subgraph specific circuit breakers + for k, v := range cfg.Subgraphs { + entry.SubgraphMap[k] = newRetryConfig(v.BackoffJitterRetry) + } + + return entry +} + +func newRetryConfig(config config.BackoffJitterRetry) retrytransport.RetryOptions { + algorithm := config.Algorithm + if algorithm == "" { + algorithm = retrytransport.BackoffJitter + } + return retrytransport.RetryOptions{ + Enabled: config.Enabled, + Algorithm: algorithm, + MaxRetryCount: config.MaxAttempts, + MaxDuration: config.MaxDuration, + Interval: config.Interval, + Expression: config.Expression, + } +} + func newCircuitBreakerConfig(cb config.CircuitBreaker) circuit.CircuitBreakerConfig { return circuit.CircuitBreakerConfig{ Enabled: cb.Enabled, diff --git a/router/core/router_config.go b/router/core/router_config.go index 
2a2a78924d..e88ed342a4 100644 --- a/router/core/router_config.go +++ b/router/core/router_config.go @@ -9,7 +9,6 @@ import ( "github.com/wundergraph/cosmo/router/internal/graphqlmetrics" "github.com/wundergraph/cosmo/router/internal/persistedoperation" rd "github.com/wundergraph/cosmo/router/internal/persistedoperation/operationstorage/redis" - "github.com/wundergraph/cosmo/router/internal/retrytransport" "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/controlplane/configpoller" "github.com/wundergraph/cosmo/router/pkg/controlplane/selfregister" @@ -77,12 +76,12 @@ type Config struct { headerRules *config.HeaderRules subgraphTransportOptions *SubgraphTransportOptions subgraphCircuitBreakerOptions *SubgraphCircuitBreakerOptions + retryOptions *SubgraphRetryOptions graphqlMetricsConfig *GraphQLMetricsConfig routerTrafficConfig *config.RouterTrafficConfiguration batchingConfig *BatchingConfig fileUploadConfig *config.FileUpload accessController *AccessController - retryOptions retrytransport.RetryOptions redisClient rd.RDCloser mcpServer *mcpserver.GraphQLSchemaServer processStartTime time.Time @@ -219,7 +218,7 @@ func (c *Config) Usage() map[string]any { usage["file_upload_max_files"] = c.fileUploadConfig.MaxFiles } usage["access_controller"] = c.accessController != nil - usage["retry_options"] = c.retryOptions.Enabled + usage["retry_options"] = c.retryOptions.IsEnabled() usage["development_mode"] = c.developmentMode usage["access_logs"] = c.accessLogsConfig != nil usage["localhost_fallback_inside_docker"] = c.localhostFallbackInsideDocker diff --git a/router/core/supervisor_instance.go b/router/core/supervisor_instance.go index 09605f6d9e..2c9d1b3c86 100644 --- a/router/core/supervisor_instance.go +++ b/router/core/supervisor_instance.go @@ -193,15 +193,7 @@ func optionsFromResources(logger *zap.Logger, config *config.Config) []Option { WithFileUploadConfig(&config.FileUpload), 
WithSubgraphTransportOptions(NewSubgraphTransportOptions(config.TrafficShaping)), WithSubgraphCircuitBreakerOptions(NewSubgraphCircuitBreakerOptions(config.TrafficShaping)), - WithSubgraphRetryOptions( - config.TrafficShaping.All.BackoffJitterRetry.Enabled, - config.TrafficShaping.All.BackoffJitterRetry.Algorithm, - config.TrafficShaping.All.BackoffJitterRetry.MaxAttempts, - config.TrafficShaping.All.BackoffJitterRetry.MaxDuration, - config.TrafficShaping.All.BackoffJitterRetry.Interval, - config.TrafficShaping.All.BackoffJitterRetry.Expression, - nil, - ), + WithSubgraphRetryOptions(NewSubgraphRetryOptions(config.TrafficShaping)), WithCors(&cors.Config{ Enabled: config.CORS.Enabled, AllowOrigins: config.CORS.AllowOrigins, diff --git a/router/core/transport.go b/router/core/transport.go index 96c838689d..1f6b3c3aa5 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -66,11 +66,11 @@ type sfCacheItem struct { func NewCustomTransport( baseRoundTripper http.RoundTripper, - retryOptions retrytransport.RetryOptions, metricStore metric.Store, connectionMetricStore metric.ConnectionMetricStore, enableSingleFlight bool, breaker *circuit.Manager, + retryManager *retrytransport.Manager, enableTraceClient bool, ) *CustomTransport { ct := &CustomTransport{ @@ -102,8 +102,8 @@ func NewCustomTransport( baseRoundTripper = circuit.NewCircuitTripper(baseRoundTripper, breaker, getRequestContextLogger) } - if retryOptions.Enabled { - ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, retryOptions, getRequestContextLogger) + if retryManager.IsEnabled() { + ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, getRequestContextLogger, retryManager) } else { ct.roundTripper = baseRoundTripper } @@ -339,11 +339,11 @@ func (ct *CustomTransport) singleFlightKey(req *http.Request) uint64 { type TransportFactory struct { preHandlers []TransportPreHandler postHandlers []TransportPostHandler - retryOptions retrytransport.RetryOptions 
localhostFallbackInsideDocker bool metricStore metric.Store connectionMetricStore metric.ConnectionMetricStore circuitBreaker *circuit.Manager + retryManager *retrytransport.Manager logger *zap.Logger tracerProvider *sdktrace.TracerProvider tracePropagators propagation.TextMapPropagator @@ -356,7 +356,6 @@ type TransportOptions struct { PreHandlers []TransportPreHandler PostHandlers []TransportPostHandler SubgraphTransportOptions *SubgraphTransportOptions - RetryOptions retrytransport.RetryOptions LocalhostFallbackInsideDocker bool MetricStore metric.Store ConnectionMetricStore metric.ConnectionMetricStore @@ -365,6 +364,7 @@ type TransportOptions struct { TracerProvider *sdktrace.TracerProvider TracePropagators propagation.TextMapPropagator EnableTraceClient bool + RetryManager *retrytransport.Manager } type SubscriptionClientOptions struct { @@ -378,7 +378,6 @@ func NewTransport(opts *TransportOptions) *TransportFactory { return &TransportFactory{ preHandlers: opts.PreHandlers, postHandlers: opts.PostHandlers, - retryOptions: opts.RetryOptions, localhostFallbackInsideDocker: opts.LocalhostFallbackInsideDocker, metricStore: opts.MetricStore, connectionMetricStore: opts.ConnectionMetricStore, @@ -386,6 +385,7 @@ func NewTransport(opts *TransportOptions) *TransportFactory { tracerProvider: opts.TracerProvider, tracePropagators: opts.TracePropagators, circuitBreaker: opts.CircuitBreaker, + retryManager: opts.RetryManager, enableTraceClient: opts.EnableTraceClient, } } @@ -427,11 +427,11 @@ func (t TransportFactory) RoundTripper(enableSingleFlight bool, baseTransport ht ) tp := NewCustomTransport( traceTransport, - t.retryOptions, t.metricStore, t.connectionMetricStore, enableSingleFlight, t.circuitBreaker, + t.retryManager, t.enableTraceClient, ) diff --git a/router/internal/expr/retry_expression.go b/router/internal/expr/retry_expression.go index 4414d29181..0d49664dd6 100644 --- a/router/internal/expr/retry_expression.go +++ 
b/router/internal/expr/retry_expression.go @@ -10,13 +10,27 @@ import ( // RetryExpressionManager handles compilation and evaluation of retry expressions type RetryExpressionManager struct { - program *vm.Program + expressionMap map[string]*vm.Program } -// NewRetryExpressionManager creates a new RetryExpressionManager with the given expression -func NewRetryExpressionManager(expression string) (*RetryExpressionManager, error) { +const defaultRetryExpression = "IsRetryableStatusCode() || IsConnectionError() || IsTimeout()" + +// NewRetryExpressionManager creates a new RetryExpressionManager +func NewRetryExpressionManager() *RetryExpressionManager { + return &RetryExpressionManager{ + expressionMap: make(map[string]*vm.Program), + } +} + +func (c *RetryExpressionManager) AddExpression(exprString string) error { + expression := exprString if expression == "" { - return nil, nil + expression = defaultRetryExpression + } + + // The expression has already been processed, skip recompilation + if _, ok := c.expressionMap[expression]; ok { + return nil } // Compile the expression with retry context @@ -27,22 +41,32 @@ func NewRetryExpressionManager(expression string) (*RetryExpressionManager, erro program, err := expr.Compile(expression, options...) 
if err != nil { - return nil, fmt.Errorf("failed to compile retry expression: %w", handleExpressionError(err)) + return fmt.Errorf("failed to compile retry expression: %w", handleExpressionError(err)) } - return &RetryExpressionManager{ - program: program, - }, nil + // Use the normalized expression string as the key for deduplication + c.expressionMap[expression] = program + return nil } // ShouldRetry evaluates the retry expression with the given context -func (m *RetryExpressionManager) ShouldRetry(ctx RetryContext) (bool, error) { - if m == nil || m.program == nil { - // Use default behavior if no expression is configured +func (m *RetryExpressionManager) ShouldRetry(ctx RetryContext, expressionString string) (bool, error) { + if m == nil { + return false, nil + } + + expression := expressionString + if expression == "" { + expression = defaultRetryExpression + } + + program, ok := m.expressionMap[expression] + if !ok { + // If the expression wasn't pre-compiled, do not retry by default return false, nil } - result, err := expr.Run(m.program, ctx) + result, err := expr.Run(program, ctx) if err != nil { return false, fmt.Errorf("failed to evaluate retry expression: %w", handleExpressionError(err)) } diff --git a/router/internal/expr/retry_expression_test.go b/router/internal/expr/retry_expression_test.go index 04a8c8c268..d57a9d37c5 100644 --- a/router/internal/expr/retry_expression_test.go +++ b/router/internal/expr/retry_expression_test.go @@ -122,7 +122,8 @@ func TestRetryExpressionManager(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - manager, err := NewRetryExpressionManager(tt.expression) + manager := NewRetryExpressionManager() + err := manager.AddExpression(tt.expression) if tt.expectErr { assert.Error(t, err) return @@ -130,7 +131,7 @@ func TestRetryExpressionManager(t *testing.T) { require.NoError(t, err) require.NotNil(t, manager) - result, err := manager.ShouldRetry(tt.ctx) + result, err := 
manager.ShouldRetry(tt.ctx, tt.expression) assert.NoError(t, err) assert.Equal(t, tt.expected, result) }) @@ -138,9 +139,10 @@ func TestRetryExpressionManager(t *testing.T) { } func TestRetryExpressionManager_EmptyExpression(t *testing.T) { - manager, err := NewRetryExpressionManager("") + manager := NewRetryExpressionManager() + err := manager.AddExpression("") assert.NoError(t, err) - assert.Nil(t, manager) + assert.NotNil(t, manager) } func TestLoadRetryContext(t *testing.T) { @@ -493,56 +495,56 @@ func (e *mockTimeoutError) Temporary() bool { func TestRetryExpressionManager_WithSyscallErrors(t *testing.T) { t.Run("expression with specific syscall error helpers", func(t *testing.T) { expression := "IsConnectionRefused() || IsConnectionReset() || IsTimeout()" - manager, err := NewRetryExpressionManager(expression) - require.NoError(t, err) + manager := NewRetryExpressionManager() + require.NoError(t, manager.AddExpression(expression)) require.NotNil(t, manager) // Test ECONNREFUSED ctx := LoadRetryContext(syscall.ECONNREFUSED, nil) - result, err := manager.ShouldRetry(ctx) + result, err := manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test ECONNRESET ctx = LoadRetryContext(syscall.ECONNRESET, nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test ETIMEDOUT ctx = LoadRetryContext(syscall.ETIMEDOUT, nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test unrelated error ctx = LoadRetryContext(errors.New("some other error"), nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.False(t, result) }) t.Run("expression combining status and syscall errors", func(t *testing.T) { expression := "statusCode == 500 || IsConnectionRefused()" - manager, err := 
NewRetryExpressionManager(expression) - require.NoError(t, err) + manager := NewRetryExpressionManager() + require.NoError(t, manager.AddExpression(expression)) require.NotNil(t, manager) // Test with status code ctx := LoadRetryContext(nil, &http.Response{StatusCode: 500}) - result, err := manager.ShouldRetry(ctx) + result, err := manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test with syscall error ctx = LoadRetryContext(syscall.ECONNREFUSED, nil) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.True(t, result) // Test with neither condition ctx = LoadRetryContext(nil, &http.Response{StatusCode: 200}) - result, err = manager.ShouldRetry(ctx) + result, err = manager.ShouldRetry(ctx, expression) assert.NoError(t, err) assert.False(t, result) }) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go new file mode 100644 index 0000000000..ba6b067d36 --- /dev/null +++ b/router/internal/retrytransport/manager.go @@ -0,0 +1,144 @@ +package retrytransport + +import ( + "errors" + "fmt" + "net/http" + "sync" + "time" + + nodev1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" + "github.com/wundergraph/cosmo/router/internal/expr" + "go.uber.org/zap" +) + +type ( + ShouldRetryFunc func(err error, req *http.Request, resp *http.Response, exprString string) bool + OnRetryFunc func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) + requestLoggerGetter func(req *http.Request) *zap.Logger +) + +const ( + BackoffJitter = "backoff_jitter" +) + +type RetryOptions struct { + Enabled bool + Algorithm string + MaxRetryCount int + Interval time.Duration + MaxDuration time.Duration + Expression string +} + +type Manager struct { + retries map[string]*RetryOptions + lock sync.RWMutex + exprManager *expr.RetryExpressionManager + retryFunc ShouldRetryFunc + OnRetry OnRetryFunc 
+}
+
+func NewManager(exprManager *expr.RetryExpressionManager, retryFunc ShouldRetryFunc, onRetryFunc OnRetryFunc) *Manager {
+	return &Manager{
+		retries:     make(map[string]*RetryOptions),
+		exprManager: exprManager,
+		retryFunc:   retryFunc,
+		OnRetry:     onRetryFunc,
+	}
+}
+
+func (m *Manager) Initialize(
+	baseRetryOptions RetryOptions,
+	subgraphRetryOptions map[string]RetryOptions,
+	subgraphs []*nodev1.Subgraph,
+) error {
+	var joinErr error
+
+	defaultSgNames := make([]string, 0, len(subgraphs))
+	customSgNames := make([]string, 0, len(subgraphs))
+
+	for _, subgraph := range subgraphs {
+		entry, ok := subgraphRetryOptions[subgraph.Name]
+		if !ok {
+			defaultSgNames = append(defaultSgNames, subgraph.Name)
+		} else if entry.Enabled {
+			// This also covers the case where a subgraph has retries explicitly disabled
+			customSgNames = append(customSgNames, subgraph.Name)
+		}
+	}
+
+	if len(defaultSgNames) > 0 && baseRetryOptions.Enabled {
+		if baseRetryOptions.Algorithm != BackoffJitter {
+			joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm))
+		} else {
+			err := m.exprManager.AddExpression(baseRetryOptions.Expression)
+			if err != nil {
+				joinErr = errors.Join(joinErr, fmt.Errorf("failed to add base retry expression: %w", err))
+			}
+		}
+	}
+
+	for _, sgName := range defaultSgNames {
+		m.retries[sgName] = &baseRetryOptions
+	}
+
+	for _, sgName := range customSgNames {
+		entry, ok := subgraphRetryOptions[sgName]
+		if !ok {
+			joinErr = errors.Join(joinErr, errors.New("retry config not found for subgraph "+sgName))
+			continue
+		}
+
+		if entry.Algorithm != BackoffJitter {
+			joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", entry.Algorithm))
+			continue
+		}
+
+		m.retries[sgName] = &entry
+
+		err := m.exprManager.AddExpression(entry.Expression)
+		if err != nil {
+			joinErr = errors.Join(joinErr, fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err))
+			continue
+		}
+	}
+
+	return joinErr
+}
+
+func (c *Manager) GetSubgraphOptions(name string) *RetryOptions { + if c == nil { + return nil + } + + c.lock.RLock() + defer c.lock.RUnlock() + + if circuitBreaker, ok := c.retries[name]; ok { + return circuitBreaker + } + return nil +} + +func (c *Manager) IsEnabled() bool { + if c == nil { + return false + } + + c.lock.RLock() + defer c.lock.RUnlock() + + return len(c.retries) > 0 +} + +func (c *Manager) Retry(err error, req *http.Request, resp *http.Response, exprString string) bool { + return c.retryFunc(err, req, resp, exprString) +} + +// OnRetryHook triggers the configured OnRetry callback, if any. +func (c *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { + if c.OnRetry != nil { + c.OnRetry(count, req, resp, sleepDuration, err) + } +} diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index da0b262bea..43f60364a5 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -7,32 +7,16 @@ import ( "strconv" "time" + rcontext "github.com/wundergraph/cosmo/router/internal/context" + "github.com/cloudflare/backoff" "go.uber.org/zap" ) -type ShouldRetryFunc func(err error, req *http.Request, resp *http.Response) bool -type OnRetryFunc func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) - -type RetryOptions struct { - Enabled bool - Algorithm string - MaxRetryCount int - Interval time.Duration - MaxDuration time.Duration - Expression string - ShouldRetry ShouldRetryFunc - - // Test specific only - OnRetry OnRetryFunc -} - -type requestLoggerGetter func(req *http.Request) *zap.Logger - type RetryHTTPTransport struct { - RoundTripper http.RoundTripper - RetryOptions RetryOptions + roundTripper http.RoundTripper getRequestLogger requestLoggerGetter + retryManager *Manager } // parseRetryAfterHeader parses the Retry-After header value 
according to RFC 7231. @@ -92,36 +76,50 @@ func shouldUseRetryAfter(logger *zap.Logger, resp *http.Response, maxDuration ti func NewRetryHTTPTransport( roundTripper http.RoundTripper, - retryOptions RetryOptions, getRequestLogger requestLoggerGetter, + retryManager *Manager, ) *RetryHTTPTransport { return &RetryHTTPTransport{ - RoundTripper: roundTripper, - RetryOptions: retryOptions, + roundTripper: roundTripper, getRequestLogger: getRequestLogger, + retryManager: retryManager, } } func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, error) { - resp, err := rt.RoundTripper.RoundTrip(req) + resp, err := rt.roundTripper.RoundTrip(req) // Short circuit if the request was successful. if err == nil && isResponseOK(resp) { return resp, nil } - b := backoff.New(rt.RetryOptions.MaxDuration, rt.RetryOptions.Interval) + var subgraph string + subgraphCtxVal := req.Context().Value(rcontext.CurrentSubgraphContextKey{}) + if subgraphCtxVal != nil { + if sg, ok := subgraphCtxVal.(string); ok { + subgraph = sg + } + } + + // If there is no option defined for this subgraph + retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) + if retryOptions == nil { + return rt.roundTripper.RoundTrip(req) + } + + b := backoff.New(retryOptions.MaxDuration, retryOptions.Interval) defer b.Reset() requestLogger := rt.getRequestLogger(req) // Retry logic retries := 0 - for (rt.RetryOptions.ShouldRetry(err, req, resp)) && retries < rt.RetryOptions.MaxRetryCount { + for (rt.retryManager.Retry(err, req, resp, retryOptions.Expression)) && retries < retryOptions.MaxRetryCount { retries++ // Check if we should use Retry-After header for 429 responses var sleepDuration time.Duration - if retryAfterDuration, useRetryAfter := shouldUseRetryAfter(requestLogger, resp, rt.RetryOptions.MaxDuration); useRetryAfter { + if retryAfterDuration, useRetryAfter := shouldUseRetryAfter(requestLogger, resp, retryOptions.MaxDuration); useRetryAfter { sleepDuration = retryAfterDuration 
requestLogger.Debug("Using Retry-After header for 429 response", zap.Int("retry", retries), @@ -138,10 +136,8 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro ) } - // Test Specific - if rt.RetryOptions.OnRetry != nil { - rt.RetryOptions.OnRetry(retries, req, resp, sleepDuration, err) - } + // Notify hook before sleeping + rt.retryManager.OnRetryHook(retries, err, req, resp, sleepDuration) // Wait for the specified duration time.Sleep(sleepDuration) @@ -150,7 +146,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro rt.drainBody(resp, requestLogger) // Retry the request - resp, err = rt.RoundTripper.RoundTrip(req) + resp, err = rt.roundTripper.RoundTrip(req) // Short circuit if the request was successful if err == nil && isResponseOK(resp) { diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index e2d49d3f28..5aa9163d49 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -2,6 +2,7 @@ package retrytransport import ( "bytes" + "context" "errors" "fmt" "io" @@ -15,6 +16,9 @@ import ( "github.com/stretchr/testify/assert" "go.uber.org/zap" + + rcontext "github.com/wundergraph/cosmo/router/internal/context" + "github.com/wundergraph/cosmo/router/internal/expr" ) const defaultMaxDuration = 100 * time.Second @@ -51,13 +55,31 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { return dt.handler(req) } +func withSubgraph(req *http.Request, name string) *http.Request { + ctx := context.WithValue(req.Context(), rcontext.CurrentSubgraphContextKey{}, name) + return req.WithContext(ctx) +} + +func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { + mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, 
resp *http.Response, exprString string) bool { + return shouldRetry(err, req, resp) + }, onRetry) + // attach options for the subgraph + mgr.retries[subgraphName] = &opts + return mgr +} + func TestRetryOnHTTP5xx(t *testing.T) { retries := 0 attemptCount := 0 maxRetries := 3 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -72,21 +94,11 @@ func TestRetryOnHTTP5xx(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -102,8 +114,12 @@ func TestRetryOnErrors(t *testing.T) { attemptCount := 0 maxRetries := 3 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ 
handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -116,21 +132,11 @@ func TestRetryOnErrors(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -156,8 +162,12 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { return simpleShouldRetry(err, req, resp) } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ switch attemptCount { @@ -173,21 +183,11 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetryCount, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, 
+ mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -232,8 +232,12 @@ func (b *TrackableBody) Close() error { func TestShortCircuitOnSuccess(t *testing.T) { attemptCount := 0 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + t.Error("OnRetry should not be called when first request succeeds") + }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ // Always return success @@ -243,21 +247,11 @@ func TestShortCircuitOnSuccess(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: 5, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - t.Error("OnRetry should not be called when first request succeeds") - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -276,29 +270,23 @@ func TestMaxRetryCountRespected(t *testing.T) { retries := 0 attemptCount := 0 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err 
error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ // Always return retryable error to test max retry limit return nil, errors.New("always fail") }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -323,8 +311,12 @@ func TestResponseBodyDraining(t *testing.T) { } } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + actualRetries++ + }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { index++ if index < retryCount { @@ -340,21 +332,11 @@ func TestResponseBodyDraining(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: retryCount, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, 
sleepDuration time.Duration, err error) { - actualRetries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -398,8 +380,12 @@ func TestRequestLoggerIsUsed(t *testing.T) { bodies[i] = trackableBody } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + actualRetries++ + }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { index++ if index < retryCount { @@ -415,21 +401,11 @@ func TestRequestLoggerIsUsed(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return requestLogger - }, - RetryOptions: RetryOptions{ - MaxRetryCount: retryCount, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - actualRetries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return requestLogger }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") tr.RoundTrip(req) @@ -459,8 +435,17 @@ func TestOnRetryCallbackInvoked(t *testing.T) { resp *http.Response } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + retryCallbacks = 
append(retryCallbacks, struct { + count int + err error + resp *http.Response + }{count: count, err: err, resp: resp}) + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { if retries < maxRetries { // Return retryable error @@ -472,26 +457,11 @@ func TestOnRetryCallbackInvoked(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: simpleShouldRetry, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - retryCallbacks = append(retryCallbacks, struct { - count int - err error - resp *http.Response - }{count: count, err: err, resp: resp}) - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -519,8 +489,12 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -545,21 +519,11 @@ 
func TestRetryOn429WithDelaySeconds(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 100 * time.Millisecond, // This should be ignored for 429 - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -583,8 +547,12 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -609,21 +577,11 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 100 * time.Millisecond, // This should be ignored for 429 - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + 
func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -642,8 +600,12 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { attemptCount := 0 maxRetries := 2 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -659,21 +621,11 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -692,8 +644,12 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { var retryAfterUsed []bool var expectedDuration time.Duration - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + 
retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -722,21 +678,11 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 100 * time.Millisecond, // This should be ignored for 429 - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -755,8 +701,12 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { attemptCount := 0 maxRetries := 2 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -774,21 +724,11 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * 
time.Millisecond, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -804,8 +744,12 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { attemptCount := 0 maxRetries := 2 - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { @@ -823,21 +767,11 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { }, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -855,8 +789,12 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { // Track which responses used Retry-After vs normal 
backoff var retryAfterUsedPerAttempt []bool - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ switch attemptCount { @@ -902,21 +840,11 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { } }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: maxRetries, - Interval: 10 * time.Millisecond, // Should be used for non-429-with-Retry-After cases - MaxDuration: 10 * time.Second, - ShouldRetry: shouldRetryWith429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -945,8 +873,12 @@ func TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { return err != nil } - tr := RetryHTTPTransport{ - RoundTripper: &MockTransport{ + mgr := newTestManager(shouldNotRetry429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + retries++ + }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + + tr := NewRetryHTTPTransport( + &MockTransport{ handler: func(req *http.Request) (*http.Response, error) { attemptCount++ // Always return 429 with Retry-After header @@ -958,21 +890,11 @@ func 
TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { return resp, nil }, }, - getRequestLogger: func(req *http.Request) *zap.Logger { - return zap.NewNop() - }, - RetryOptions: RetryOptions{ - MaxRetryCount: 3, - Interval: 1 * time.Millisecond, - MaxDuration: 10 * time.Millisecond, - ShouldRetry: shouldNotRetry429, - OnRetry: func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { - retries++ - }, - }, - } + func(req *http.Request) *zap.Logger { return zap.NewNop() }, + mgr, + ) - req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) + req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") resp, err := tr.RoundTrip(req) assert.NoError(t, err) From 39a200c8aeabfaccc6f220f014c9a214cb035b14 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:07:49 +0530 Subject: [PATCH 02/24] fix: review comments --- router/internal/retrytransport/manager.go | 39 +++++++++++-------- .../retrytransport/retry_transport.go | 2 +- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index ba6b067d36..8121c619da 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -53,6 +53,9 @@ func (m *Manager) Initialize( subgraphRetryOptions map[string]RetryOptions, subgraphs []*nodev1.Subgraph, ) error { + m.lock.Lock() + defer m.lock.Unlock() + var joinErr error defaultSgNames := make([]string, 0, len(subgraphs)) @@ -95,7 +98,8 @@ func (m *Manager) Initialize( continue } - m.retries[sgName] = &entry + opts := entry + m.retries[sgName] = &opts err := m.exprManager.AddExpression(entry.Expression) if err != nil { @@ -107,38 +111,41 @@ func (m *Manager) Initialize( return joinErr } -func (c *Manager) GetSubgraphOptions(name string) *RetryOptions { - if c == nil { +func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { + if m == 
nil { return nil } - c.lock.RLock() - defer c.lock.RUnlock() + m.lock.RLock() + defer m.lock.RUnlock() - if circuitBreaker, ok := c.retries[name]; ok { + if circuitBreaker, ok := m.retries[name]; ok { return circuitBreaker } return nil } -func (c *Manager) IsEnabled() bool { - if c == nil { +func (m *Manager) IsEnabled() bool { + if m == nil { return false } - c.lock.RLock() - defer c.lock.RUnlock() + m.lock.RLock() + defer m.lock.RUnlock() - return len(c.retries) > 0 + return len(m.retries) > 0 } -func (c *Manager) Retry(err error, req *http.Request, resp *http.Response, exprString string) bool { - return c.retryFunc(err, req, resp, exprString) +func (m *Manager) Retry(err error, req *http.Request, resp *http.Response, exprString string) bool { + if m.retryFunc == nil { + return false + } + return m.retryFunc(err, req, resp, exprString) } // OnRetryHook triggers the configured OnRetry callback, if any. -func (c *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { - if c.OnRetry != nil { - c.OnRetry(count, req, resp, sleepDuration, err) +func (m *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { + if m.OnRetry != nil { + m.OnRetry(count, req, resp, sleepDuration, err) } } diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 43f60364a5..8258e263d0 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -104,7 +104,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // If there is no option defined for this subgraph retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) if retryOptions == nil { - return rt.roundTripper.RoundTrip(req) + return resp, nil } b := backoff.New(retryOptions.MaxDuration, retryOptions.Interval) From a16a8f971d26bf1fd084855c9620612f7097124b Mon Sep 
17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:10:18 +0530 Subject: [PATCH 03/24] fix: add test --- router-tests/retry_test.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 010b3ab1d8..4ce9be0bca 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -530,6 +530,33 @@ func TestRetryPerSubgraph(t *testing.T) { require.ErrorContains(t, err, "unsupported retry algorithm") }) + t.Run("invalid algorithm is ignored when retries are disabled (base)", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: false, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) {}) + + require.NoError(t, err) + }) + t.Run("verify invalid algorithm is detected for per subgraphs", func(t *testing.T) { t.Parallel() From a8a9cc2aa781438a667d8852b3f6e6f92b3e20df Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:12:32 +0530 Subject: [PATCH 04/24] fix: tests --- router-tests/retry_test.go | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 4ce9be0bca..9df5208fad 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -557,6 +557,35 @@ func TestRetryPerSubgraph(t *testing.T) { require.NoError(t, err) }) + t.Run("invalid algorithm is ignored when retries are disabled (per subgraph)", func(t *testing.T) { + t.Parallel() + + opts := 
core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: false, + Algorithm: "invalid_algorithm", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) {}) + + require.NoError(t, err) + }) + t.Run("verify invalid algorithm is detected for per subgraphs", func(t *testing.T) { t.Parallel() From 79cdb75b3c2d3767e7ae7f062f834532e1ce5922 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:19:56 +0530 Subject: [PATCH 05/24] fix: add default options to subgraphs only when required --- router/internal/retrytransport/manager.go | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 8121c619da..01c0327f30 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -71,6 +71,7 @@ func (m *Manager) Initialize( } } + // First validate and add expressions for base retry options if needed if len(defaultSgNames) > 0 && baseRetryOptions.Enabled { if baseRetryOptions.Algorithm != BackoffJitter { joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm)) @@ -78,14 +79,17 @@ func (m *Manager) Initialize( err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { joinErr = errors.Join(joinErr, fmt.Errorf("failed to add base retry expression: %w", err)) + } else { + // Only assign default options if validation succeeds + for _, sgName := range defaultSgNames { + opts := baseRetryOptions + m.retries[sgName] = 
&opts + } } } } - for _, sgName := range defaultSgNames { - m.retries[sgName] = &baseRetryOptions - } - + // Process custom retry options for _, sgName := range customSgNames { entry, ok := subgraphRetryOptions[sgName] if !ok { @@ -94,18 +98,20 @@ func (m *Manager) Initialize( } if entry.Algorithm != BackoffJitter { - joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm)) + joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", entry.Algorithm)) continue } - opts := entry - m.retries[sgName] = &opts - + // Validate expression before assigning options err := m.exprManager.AddExpression(entry.Expression) if err != nil { - joinErr = errors.Join(joinErr, errors.New("retry expression did not get added "+sgName)) + joinErr = errors.Join(joinErr, fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err)) continue } + + // Create a new copy of the options + opts := entry + m.retries[sgName] = &opts } return joinErr From 33446317078bde578480b27bd07dea0765a800fd Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Mon, 1 Sep 2025 19:34:38 +0530 Subject: [PATCH 06/24] fix: review comments --- router/internal/retrytransport/manager.go | 7 ------- router/internal/retrytransport/retry_transport.go | 8 +++++--- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 01c0327f30..2cab10bae9 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -148,10 +148,3 @@ func (m *Manager) Retry(err error, req *http.Request, resp *http.Response, exprS } return m.retryFunc(err, req, resp, exprString) } - -// OnRetryHook triggers the configured OnRetry callback, if any. 
-func (m *Manager) OnRetryHook(count int, err error, req *http.Request, resp *http.Response, sleepDuration time.Duration) { - if m.OnRetry != nil { - m.OnRetry(count, req, resp, sleepDuration, err) - } -} diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 8258e263d0..83cdae16c4 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -104,7 +104,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // If there is no option defined for this subgraph retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) if retryOptions == nil { - return resp, nil + return resp, err } b := backoff.New(retryOptions.MaxDuration, retryOptions.Interval) @@ -136,8 +136,10 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro ) } - // Notify hook before sleeping - rt.retryManager.OnRetryHook(retries, err, req, resp, sleepDuration) + // A hook used for testing + if rt.retryManager.OnRetry != nil { + rt.retryManager.OnRetry(retries, req, resp, sleepDuration, err) + } // Wait for the specified duration time.Sleep(sleepDuration) From e3cebe5f3c4a04880218fd16e1377e68bc17845b Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 2 Sep 2025 16:53:49 +0530 Subject: [PATCH 07/24] fix: cleanup formatting --- router-tests/telemetry/telemetry_test.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/router-tests/telemetry/telemetry_test.go b/router-tests/telemetry/telemetry_test.go index 205128a295..8f4dd99c4d 100644 --- a/router-tests/telemetry/telemetry_test.go +++ b/router-tests/telemetry/telemetry_test.go @@ -826,6 +826,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate 
operation cache telemetry for persisted and non persisted operations", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1203,6 +1204,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate operation cache telemetry when prometheus is also enabled", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1544,6 +1546,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate key and cost eviction metrics with small validation cache size without feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -1887,6 +1890,7 @@ func TestFlakyOperationCacheTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, maxCostMetrics, *integration.GetMetricByName(cacheScope, "router.graphql.cache.cost.max"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run("Validate operation cache telemetry for default configuration including feature flags", func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() @@ -3558,6 +3562,7 @@ func TestFlakyTelemetry(t *testing.T) { }, }, } + want := metricdata.ScopeMetrics{ Scope: instrumentation.Scope{ Name: "cosmo.router", @@ -4082,6 +4087,7 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, sn[1].Attributes(), otel.WgEnginePersistedOperationCacheHit.Bool(true)) }) }) + t.Run("Should exclude high cardinality attributes only from metrics if custom exporter is defined", func(t *testing.T) { t.Parallel() @@ -4729,6 +4735,7 @@ func TestFlakyTelemetry(t *testing.T) { require.Contains(t, rm.Resource.Attributes(), attribute.String("service.name", "cosmo-router")) require.Len(t, 
rm.ScopeMetrics, defaultExposedScopedMetricsCount) + scopeMetric := *integration.GetMetricScopeByName(rm.ScopeMetrics, "cosmo.router") require.Len(t, scopeMetric.Metrics, defaultCosmoRouterMetricsCount) @@ -5099,6 +5106,7 @@ func TestFlakyTelemetry(t *testing.T) { }) }) + t.Run("Should remap metric name to configured value", func(t *testing.T) { t.Parallel() @@ -5439,6 +5447,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) + t.Run("Custom span and resource attributes are attached to all metrics and spans / from header", func(t *testing.T) { t.Parallel() @@ -5870,6 +5879,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) + t.Run("Custom span and resource attributes are attached to all metrics and spans / static", func(t *testing.T) { t.Parallel() @@ -6299,6 +6309,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, routerInfoMetric, scopeMetric.Metrics[6], metricdatatest.IgnoreTimestamp(), metricdatatest.IgnoreValue()) }) }) + t.Run("Requesting a feature flags will emit different router config version and add the feature flag attribute", func(t *testing.T) { t.Parallel() @@ -6939,6 +6950,7 @@ func TestFlakyTelemetry(t *testing.T) { require.Equal(t, sdktrace.Status{Code: codes.Unset}, sn[8].Status()) }) }) + t.Run("Origin connectivity issue is traced", func(t *testing.T) { t.Parallel() @@ -7520,6 +7532,7 @@ func TestFlakyTelemetry(t *testing.T) { integration.AssertAttributeNotInSet(t, resClFiltered.DataPoints[1].Attributes, otel.WgOperationName.String("")) }) }) + t.Run("Custom Metric Attributes", func(t *testing.T) { t.Parallel() @@ -8091,6 +8104,7 @@ func TestFlakyTelemetry(t *testing.T) { metricdatatest.AssertEqual(t, want, scopeMetric, metricdatatest.IgnoreTimestamp(), 
metricdatatest.IgnoreValue()) }) }) + t.Run("should not remap high cardinality fields when using cloud exporter but include custom metric attributes", func(t *testing.T) { t.Parallel() @@ -8732,6 +8746,7 @@ func TestFlakyTelemetry(t *testing.T) { require.True(t, found, "expected to find a datapoint with subgraph name and id in the metrics") }) }) + t.Run("Tracing is not affected by custom metric attributes", func(t *testing.T) { t.Parallel() @@ -9176,6 +9191,7 @@ func TestFlakyTelemetry(t *testing.T) { assert.ErrorAs(t, err, &expectedErr) }) }) + t.Run("custom trace metrics with expression", func(t *testing.T) { t.Parallel() @@ -9580,6 +9596,7 @@ func TestFlakyTelemetry(t *testing.T) { }) }) }) + t.Run("verify attribute expressions with subgraph in the expression", func(t *testing.T) { t.Run("verify subgraph expression should only be present for engine fetch", func(t *testing.T) { t.Parallel() @@ -9637,8 +9654,8 @@ func TestFlakyTelemetry(t *testing.T) { require.IsType(t, metricdata.Sum[int64]{}, httpRequestsMetric.Data) data2 := httpRequestsMetric.Data.(metricdata.Sum[int64]) - atts := data2.DataPoints[0].Attributes - _, ok := atts.Value(attribute.Key(key)) + attrs := data2.DataPoints[0].Attributes + _, ok := attrs.Value(attribute.Key(key)) require.False(t, ok) }) }) @@ -10191,6 +10208,7 @@ func TestExcludeAttributesWithCustomExporter(t *testing.T) { metricdatatest.AssertEqual(t, connectionMetrics, *integration.GetMetricByName(engineScope, "router.engine.connections"), metricdatatest.IgnoreTimestamp()) }) }) + t.Run(fmt.Sprintf("cache metrics for %s", usingCustomExporter), func(t *testing.T) { t.Parallel() metricReader := metric.NewManualReader() From c6c224bdb1b44287f09658f4922b7907b284939e Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Wed, 3 Sep 2025 17:49:14 +0530 Subject: [PATCH 08/24] fix: return error --- router/internal/retrytransport/manager.go | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git 
a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 2cab10bae9..3a0a2d9fe6 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -1,7 +1,6 @@ package retrytransport import ( - "errors" "fmt" "net/http" "sync" @@ -56,8 +55,6 @@ func (m *Manager) Initialize( m.lock.Lock() defer m.lock.Unlock() - var joinErr error - defaultSgNames := make([]string, 0, len(subgraphs)) customSgNames := make([]string, 0, len(subgraphs)) @@ -74,11 +71,11 @@ func (m *Manager) Initialize( // First validate and add expressions for base retry options if needed if len(defaultSgNames) > 0 && baseRetryOptions.Enabled { if baseRetryOptions.Algorithm != BackoffJitter { - joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm)) + return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) } else { err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { - joinErr = errors.Join(joinErr, fmt.Errorf("failed to add base retry expression: %w", err)) + return fmt.Errorf("failed to add base retry expression: %w", err) } else { // Only assign default options if validation succeeds for _, sgName := range defaultSgNames { @@ -93,20 +90,17 @@ func (m *Manager) Initialize( for _, sgName := range customSgNames { entry, ok := subgraphRetryOptions[sgName] if !ok { - joinErr = errors.Join(joinErr, errors.New("retry config not found for subgraph "+sgName)) - continue + return fmt.Errorf("failed to add base retry expression: %s", sgName) } if entry.Algorithm != BackoffJitter { - joinErr = errors.Join(joinErr, fmt.Errorf("unsupported retry algorithm: %s", entry.Algorithm)) - continue + return fmt.Errorf("unsupported retry algorithm for subgraph %s: %s", sgName, baseRetryOptions.Algorithm) } // Validate expression before assigning options err := m.exprManager.AddExpression(entry.Expression) if err != nil { - joinErr = 
errors.Join(joinErr, fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err)) - continue + return fmt.Errorf("failed to add retry expression for subgraph %s: %w", sgName, err) } // Create a new copy of the options @@ -114,7 +108,7 @@ func (m *Manager) Initialize( m.retries[sgName] = &opts } - return joinErr + return nil } func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { From dae502d75dbafc117d49932fccb94ab4e336c277 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Wed, 3 Sep 2025 18:38:26 +0530 Subject: [PATCH 09/24] fix: review comments --- router-tests/retry_test.go | 159 +++++++++++++++++++++++++++++- router/core/retry_builder_test.go | 44 --------- 2 files changed, 156 insertions(+), 47 deletions(-) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 9df5208fad..4e5fc02e6e 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -589,7 +589,6 @@ func TestRetryPerSubgraph(t *testing.T) { t.Run("verify invalid algorithm is detected for per subgraphs", func(t *testing.T) { t.Parallel() - // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ Subgraphs: map[string]config.GlobalSubgraphRequestRule{ "employees": { @@ -618,6 +617,94 @@ func TestRetryPerSubgraph(t *testing.T) { require.ErrorContains(t, err, "unsupported retry algorithm") }) + t.Run("verify invalid expression is detected for base", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "truethere", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, 
func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "failed to add base retry expression: failed to compile retry expression: line 1, column 0: unknown name truethere") + }) + + t.Run("verify invalid expression is detected per subgraphs", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "truethere", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + require.Fail(t, "expected initialization to fail due to invalid algorithm") + }) + + require.ErrorContains(t, err, "failed to add retry expression for subgraph employees: failed to compile retry expression: line 1, column 0: unknown name truethere") + }) + + t.Run("verify valid expression and algorithm", func(t *testing.T) { + t.Parallel() + + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + Subgraphs: map[string]config.GlobalSubgraphRequestRule{ + "employees": { + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + Algorithm: "backoff_jitter", + MaxAttempts: 2, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + err := testenv.RunWithError(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + }) + + require.NoError(t, err) + }) + t.Run("verify retries are applied per subgraph", func(t *testing.T) 
{ t.Parallel() @@ -690,7 +777,7 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) require.Equal(t, int32(test1Max+1), test1Calls.Load()) }) @@ -768,9 +855,75 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Contains(t, resTest1.Body, `"statusCode":502`) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) require.Equal(t, int32(test1Max+1), test1Calls.Load()) }) }) + + t.Run("verify retries are applied when only all and no subgraph specific overrides are present", func(t *testing.T) { + t.Parallel() + + employeesCalls := atomic.Int32{} + test1Calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + generalMax := 3 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: generalMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + employeesCalls.Add(1) + }) + }, + }, + Test1: testenv.SubgraphConfig{ 
+ Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusBadGateway) + test1Calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + // 1) Call employees-only query; expect employees subgraph to be retried generalMax times (attempts = retries + 1) + resEmp, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query employees { employees { id } }`, + }) + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, int32(generalMax+1), employeesCalls.Load()) + require.Equal(t, int32(0), test1Calls.Load()) + + // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) + resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { floatField(arg: 1.5) }`, + }) + + require.NoError(t, err) + require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, int32(generalMax+1), employeesCalls.Load()) + require.Equal(t, int32(generalMax+1), test1Calls.Load()) + }) + }) } diff --git a/router/core/retry_builder_test.go b/router/core/retry_builder_test.go index 6a2d34cdab..7bfeda9fcd 100644 --- a/router/core/retry_builder_test.go +++ b/router/core/retry_builder_test.go @@ -438,47 +438,3 @@ func TestBuildRetryFunction(t *testing.T) { assert.False(t, fn(errors.New("connection refused"), mutationReq, nil, expression)) }) } - -// TODO: Fix -//func TestProcessRetryOptions(t *testing.T) { -// t.Run("process invalid algorithm", func(t *testing.T) { -// algorithm := "abcdee" -// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ -// Enabled: true, -// Algorithm: 
algorithm, -// }) -// -// expectedError := fmt.Sprintf("unsupported retry algorithm: %s", algorithm) -// assert.ErrorContains(t, err, expectedError) -// }) -// -// t.Run("process invalid algorithm when retries are disabled", func(t *testing.T) { -// algorithm := "abcdee" -// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ -// Enabled: false, -// Algorithm: algorithm, -// }) -// assert.NoError(t, err) -// }) -// -// t.Run("process invalid expression", func(t *testing.T) { -// _, err := ProcessRetryOptions(retrytransport.RetryOptions{ -// Enabled: true, -// Algorithm: "backoff_jitter", -// Expression: "invalid syntax +++", -// }) -// -// assert.ErrorContains(t, err, "failed to build retry function") -// }) -// -// t.Run("process valid options", func(t *testing.T) { -// options := retrytransport.RetryOptions{ -// Enabled: true, -// Algorithm: "backoff_jitter", -// Expression: "statusCode == 500 || IsTimeout() || IsConnectionError()", -// } -// response, err := ProcessRetryOptions(options) -// assert.NoError(t, err) -// assert.NotSame(t, &options, response) -// }) -//} From 3c7647c333cb9ba703b6ddfa88fab3e6ffee1df2 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Wed, 3 Sep 2025 18:58:24 +0530 Subject: [PATCH 10/24] fix: tests --- router-tests/retry_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 4e5fc02e6e..5afd96c3a6 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -472,8 +472,8 @@ func TestFlakyRetry(t *testing.T) { Employees: testenv.SubgraphConfig{ Middleware: func(_ http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { - w.WriteHeader(http.StatusTooManyRequests) w.Header().Set("Retry-After", strconv.Itoa(emptyRetryInterval)) + w.WriteHeader(http.StatusTooManyRequests) serviceCallsCounter.Add(1) }) }, From c7b4040ed3dc248a2965e4ac6893318aeaa8c891 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: 
Wed, 3 Sep 2025 23:47:46 +0530 Subject: [PATCH 11/24] fix: review comments --- router/core/router.go | 2 +- router/internal/retrytransport/manager.go | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/router/core/router.go b/router/core/router.go index afccb42fbe..1b2ab01ffb 100644 --- a/router/core/router.go +++ b/router/core/router.go @@ -1913,7 +1913,7 @@ func NewSubgraphRetryOptions(cfg config.TrafficShapingRules) *SubgraphRetryOptio if cfg.All.BackoffJitterRetry.Enabled { entry.All = newRetryConfig(cfg.All.BackoffJitterRetry) } - // Subgraph specific circuit breakers + // Subgraph specific retry configs for k, v := range cfg.Subgraphs { entry.SubgraphMap[k] = newRetryConfig(v.BackoffJitterRetry) } diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 3a0a2d9fe6..1c7fdff1ad 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -52,9 +52,6 @@ func (m *Manager) Initialize( subgraphRetryOptions map[string]RetryOptions, subgraphs []*nodev1.Subgraph, ) error { - m.lock.Lock() - defer m.lock.Unlock() - defaultSgNames := make([]string, 0, len(subgraphs)) customSgNames := make([]string, 0, len(subgraphs)) @@ -119,8 +116,8 @@ func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { m.lock.RLock() defer m.lock.RUnlock() - if circuitBreaker, ok := m.retries[name]; ok { - return circuitBreaker + if retryOptions, ok := m.retries[name]; ok { + return retryOptions } return nil } From a976f1614a37dc5ea7a8128cf86e13961c7c4e54 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 16 Sep 2025 21:38:18 +0530 Subject: [PATCH 12/24] fix: merge resolving --- router/core/router_config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router/core/router_config.go b/router/core/router_config.go index 6b95ba8335..62ae89bd65 100644 --- a/router/core/router_config.go +++ b/router/core/router_config.go @@ -8,7 +8,7 @@ import ( nodev1 
"github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" "github.com/wundergraph/cosmo/router/internal/graphqlmetrics" "github.com/wundergraph/cosmo/router/internal/persistedoperation" - rd "github.com/wundergraph/cosmo/router/internal/persistedoperation/operationstorage/redis" + rd "github.com/wundergraph/cosmo/router/internal/rediscloser" "github.com/wundergraph/cosmo/router/pkg/config" "github.com/wundergraph/cosmo/router/pkg/controlplane/configpoller" "github.com/wundergraph/cosmo/router/pkg/controlplane/selfregister" From f4090fa1b45ffd7e4207c95595a65da314f94af4 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 16 Sep 2025 21:38:27 +0530 Subject: [PATCH 13/24] fix: upgrade request --- router-tests/circuit_breaker_test.go | 134 ++++ router-tests/retry_test.go | 102 +++ router-tests/websocket_test.go | 13 + router/core/engine_loader_hooks.go | 2 - router/core/transport.go | 32 +- router/internal/circuit/breaker.go | 27 +- router/internal/context/keys.go | 2 - .../retrytransport/retry_transport.go | 33 +- .../retrytransport/retry_transport_test.go | 728 ++++++++---------- router/internal/traceclient/traceclient.go | 33 +- 10 files changed, 646 insertions(+), 460 deletions(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index 7e7f1b60eb..2ee4ef21c0 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -2,6 +2,7 @@ package integration import ( "context" + "github.com/gorilla/websocket" "net/http" "sort" "sync/atomic" @@ -623,6 +624,139 @@ func TestCircuitBreaker(t *testing.T) { }) }) + t.Run("verify circuit breaker tripping on upgrade requests", func(t *testing.T) { + t.Parallel() + + const failedTries int64 = 3 + + const timestampMessage = `{"type":"next","id":"1","payload":{"data":{"currentTime":{"unixTime":1,"timeStamp":"2021-09-01T12:00:00Z"}}}}` + const completeMessage = `{"type":"complete","id":"1"}` + const defaultErrorMessage = 
`{"id":"1","type":"error","payload":[{"message":"Internal server error"}]}` + + breaker := getCircuitBreakerWithDefaults() + breaker.RequestThreshold = 3 + breaker.ErrorThresholdPercentage = 100 + + breaker.NumBuckets = 1 + breaker.RollingDuration = 5000 * time.Millisecond + + trafficConfig := getTrafficConfigWithTimeout(breaker, 1*time.Second) + + employeesCalls := atomic.Int64{} + + testenv.Run(t, &testenv.Config{ + ModifyEngineExecutionConfiguration: func(engineExecutionConfiguration *config.EngineExecutionConfiguration) { + engineExecutionConfiguration.WebSocketClientReadTimeout = time.Millisecond * 2000 + }, + LogObservation: testenv.LogObservationConfig{ + Enabled: true, + LogLevel: zapcore.DebugLevel, + }, + RouterOptions: []core.Option{ + core.WithSubgraphCircuitBreakerOptions(core.NewSubgraphCircuitBreakerOptions(trafficConfig)), + core.WithSubgraphTransportOptions(core.NewSubgraphTransportOptions(trafficConfig)), + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + employeesCalls.Add(1) + if employeesCalls.Load() <= failedTries { + simulateConnectionFailureOnClose(w) + return + } + + upgrader := websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return true + }, + Subprotocols: []string{"graphql-transport-ws"}, + } + conn, err := upgrader.Upgrade(w, r, nil) + require.NoError(t, err) + defer func() { + _ = conn.Close() + }() + + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(completeMessage)) + require.NoError(t, err) + }) + }, + }, + }, + }, func(t 
*testing.T, xEnv *testenv.Environment) { + for i := range failedTries + 2 { + conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) + err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ + ID: "1", + Type: "subscribe", + Payload: []byte(`{"query":"subscription { currentTime { unixTime timeStamp }}"}`), + }) + require.NoError(t, err) + + _, message, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + require.Equal(t, defaultErrorMessage, string(message)) + require.NoError(t, conn.Close()) + + switch { + case i < breaker.RequestThreshold-1: + require.Zero(t, xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) + case i == breaker.RequestThreshold-1: + require.Equal(t, 1, xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) + case i > breaker.RequestThreshold-1: + expectedCount := i - (breaker.RequestThreshold - 1) + require.Equal(t, int(expectedCount), xEnv.Observer().FilterMessage("Circuit breaker open, request callback did not execute").Len()) + } + + } + + require.Equal(t, failedTries, employeesCalls.Load()) + + // Wait for current bucket to be cleaned up + time.Sleep(breaker.RollingDuration + time.Millisecond*500) + + // ==== + // Verify a success case with messages validated from here onwards + // ==== + + // Sending a complete must stop the subscription + conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) + err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ + ID: "1", + Type: "subscribe", + Payload: []byte(`{"query":"subscription { currentTime { unixTime timeStamp }}"}`), + }) + require.NoError(t, err) + + _, message, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + require.JSONEq(t, timestampMessage, string(message)) + + err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) + require.NoError(t, err) + + err = conn.SetReadDeadline(time.Now().Add(2 * time.Second)) + require.NoError(t, err) + + _, actualCompleteMessage, err := 
testenv.WSReadMessage(t, conn) + require.NoError(t, err) + require.JSONEq(t, completeMessage, string(actualCompleteMessage)) + + xEnv.WaitForSubscriptionCount(0, time.Second*5) + }) + }) + t.Run("circuit breaker metrics", func(t *testing.T) { t.Parallel() diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index 5afd96c3a6..d52c3ef1ab 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -1,6 +1,7 @@ package integration import ( + "github.com/gorilla/websocket" "net/http" "strconv" "sync/atomic" @@ -926,4 +927,105 @@ func TestRetryPerSubgraph(t *testing.T) { require.Equal(t, int32(generalMax+1), test1Calls.Load()) }) }) + + t.Run("verify retry on upgrade requests", func(t *testing.T) { + t.Parallel() + + const failedTries int64 = 2 + + const timestampMessage = `{"type":"next","id":"1","payload":{"data":{"currentTime":{"unixTime":1,"timeStamp":"2021-09-01T12:00:00Z"}}}}` + const completeMessage = `{"type":"complete","id":"1"}` + + employeesCalls := atomic.Int64{} + + testenv.Run(t, &testenv.Config{ + ModifyEngineExecutionConfiguration: func(engineExecutionConfiguration *config.EngineExecutionConfiguration) { + engineExecutionConfiguration.WebSocketClientReadTimeout = time.Millisecond * 2000 + }, + + RouterOptions: []core.Option{ + core.WithSubgraphRetryOptions(core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 5, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: "IsRetryableStatusCode() || IsConnectionError() || IsTimeout()", + }, + }, + })), + }, + Subgraphs: testenv.SubgraphsConfig{ + Employees: testenv.SubgraphConfig{ + Middleware: func(handler http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + employeesCalls.Add(1) + if employeesCalls.Load() <= failedTries { + w.WriteHeader(http.StatusBadGateway) + return + } + + 
upgrader := websocket.Upgrader{ + CheckOrigin: func(r *http.Request) bool { + return true + }, + Subprotocols: []string{"graphql-transport-ws"}, + } + conn, err := upgrader.Upgrade(w, r, nil) + require.NoError(t, err) + defer func() { + _ = conn.Close() + }() + + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) + require.NoError(t, err) + + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(completeMessage)) + require.NoError(t, err) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) + err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ + ID: "1", + Type: "subscribe", + Payload: []byte(`{"query":"subscription { currentTime { unixTime timeStamp }}"}`), + }) + require.NoError(t, err) + + // Read a result and store its timestamp, next result should be 1 second later + _, messageBytes, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + + require.Equal(t, failedTries+1, employeesCalls.Load()) + + // ==== + // Verify the messages are correct from here onwards + // ==== + require.JSONEq(t, timestampMessage, string(messageBytes)) + + // Sending a complete must stop the subscription + err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) + require.NoError(t, err) + + err = conn.SetReadDeadline(time.Now().Add(2 * time.Second)) + require.NoError(t, err) + + _, actualCompleteMessage, err := testenv.WSReadMessage(t, conn) + require.NoError(t, err) + require.JSONEq(t, completeMessage, string(actualCompleteMessage)) + + require.NoError(t, conn.Close()) + xEnv.WaitForSubscriptionCount(0, time.Second*5) + }) + }) } diff --git a/router-tests/websocket_test.go 
b/router-tests/websocket_test.go index 5eb16bacb5..7386cce290 100644 --- a/router-tests/websocket_test.go +++ b/router-tests/websocket_test.go @@ -706,6 +706,18 @@ func TestWebSockets(t *testing.T) { t.Run("empty allow lists should allow all headers and query args", func(t *testing.T) { t.Parallel() + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: 5, + MaxDuration: 10 * time.Second, + Interval: 200 * time.Millisecond, + Expression: "true", + }, + }, + }) + headerRules := config.HeaderRules{ All: &config.GlobalHeaderRule{ Request: []*config.RequestHeaderRule{ @@ -734,6 +746,7 @@ func TestWebSockets(t *testing.T) { }, RouterOptions: []core.Option{ core.WithHeaderRules(headerRules), + core.WithSubgraphRetryOptions(opts), }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ diff --git a/router/core/engine_loader_hooks.go b/router/core/engine_loader_hooks.go index e3da6fe26e..579553e6f8 100644 --- a/router/core/engine_loader_hooks.go +++ b/router/core/engine_loader_hooks.go @@ -92,8 +92,6 @@ func (f *engineLoaderHooks) OnLoad(ctx context.Context, ds resolve.DataSourceInf start := time.Now() - ctx = context.WithValue(ctx, rcontext.CurrentSubgraphContextKey{}, ds.Name) - duration := atomic.Int64{} ctx = context.WithValue(ctx, rcontext.FetchTimingKey, &duration) diff --git a/router/core/transport.go b/router/core/transport.go index f16533210f..2211936ba6 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -84,31 +84,41 @@ func NewCustomTransport( // As a workaround we pass in a function that can be used to get the logger from within the round tripper getRequestContextLogger := func(req *http.Request) *zap.Logger { reqContext := getRequestContext(req.Context()) + if reqContext == nil { + return zap.NewNop() + } return reqContext.Logger() } + getActiveSubgraphName := func(req *http.Request) string 
{ + reqContext := getRequestContext(req.Context()) + if reqContext == nil { + return "" + } + subgraph := reqContext.ActiveSubgraph(req) + if subgraph != nil { + return subgraph.Name + } + return "" + } + if enableTraceClient { - getValuesFromRequest := func(ctx context.Context, req *http.Request) (*expr.Context, string) { + getExprContext := func(ctx context.Context, req *http.Request) *expr.Context { reqContext := getRequestContext(ctx) if reqContext == nil { - return &expr.Context{}, "" - } - - var activeSubgraphName string - if activeSubgraph := reqContext.ActiveSubgraph(req); activeSubgraph != nil { - activeSubgraphName = activeSubgraph.Name + return &expr.Context{} } - return &reqContext.expressionContext, activeSubgraphName + return &reqContext.expressionContext } - baseRoundTripper = traceclient.NewTraceInjectingRoundTripper(baseRoundTripper, connectionMetricStore, getValuesFromRequest) + baseRoundTripper = traceclient.NewTraceInjectingRoundTripper(baseRoundTripper, connectionMetricStore, getExprContext, getActiveSubgraphName) } if breaker.HasCircuits() { - baseRoundTripper = circuit.NewCircuitTripper(baseRoundTripper, breaker, getRequestContextLogger) + baseRoundTripper = circuit.NewCircuitTripper(baseRoundTripper, breaker, getRequestContextLogger, getActiveSubgraphName) } if retryManager.IsEnabled() { - ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, getRequestContextLogger, retryManager) + ct.roundTripper = retrytransport.NewRetryHTTPTransport(baseRoundTripper, getRequestContextLogger, retryManager, getActiveSubgraphName) } else { ct.roundTripper = baseRoundTripper } diff --git a/router/internal/circuit/breaker.go b/router/internal/circuit/breaker.go index bbabdb42f2..382f355032 100644 --- a/router/internal/circuit/breaker.go +++ b/router/internal/circuit/breaker.go @@ -4,34 +4,27 @@ import ( "context" "net/http" - rcontext "github.com/wundergraph/cosmo/router/internal/context" "go.uber.org/zap" ) type Breaker struct { - 
roundTripper http.RoundTripper - loggerFunc func(req *http.Request) *zap.Logger - circuitBreaker *Manager + roundTripper http.RoundTripper + loggerFunc func(req *http.Request) *zap.Logger + circuitBreaker *Manager + getActiveSubgraphName func(req *http.Request) string } -func NewCircuitTripper(roundTripper http.RoundTripper, breaker *Manager, logger func(req *http.Request) *zap.Logger) *Breaker { +func NewCircuitTripper(roundTripper http.RoundTripper, breaker *Manager, logger func(req *http.Request) *zap.Logger, getActiveSubgraphName func(req *http.Request) string) *Breaker { return &Breaker{ - circuitBreaker: breaker, - loggerFunc: logger, - roundTripper: roundTripper, + circuitBreaker: breaker, + loggerFunc: logger, + roundTripper: roundTripper, + getActiveSubgraphName: getActiveSubgraphName, } } func (rt *Breaker) RoundTrip(req *http.Request) (resp *http.Response, err error) { - ctx := req.Context() - - var subgraph string - subgraphCtxVal := ctx.Value(rcontext.CurrentSubgraphContextKey{}) - if subgraphCtxVal != nil { - if sg, ok := subgraphCtxVal.(string); ok { - subgraph = sg - } - } + subgraph := rt.getActiveSubgraphName(req) // If there is no circuit defined for this subgraph circuit := rt.circuitBreaker.GetCircuitBreaker(subgraph) diff --git a/router/internal/context/keys.go b/router/internal/context/keys.go index d1706f4ab4..3524280e02 100644 --- a/router/internal/context/keys.go +++ b/router/internal/context/keys.go @@ -4,8 +4,6 @@ // instead of being moved into core package context -type CurrentSubgraphContextKey struct{} - type ContextKey int const ( diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 83cdae16c4..26fcee92e3 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -7,16 +7,15 @@ import ( "strconv" "time" - rcontext "github.com/wundergraph/cosmo/router/internal/context" - "github.com/cloudflare/backoff" 
"go.uber.org/zap" ) type RetryHTTPTransport struct { - roundTripper http.RoundTripper - getRequestLogger requestLoggerGetter - retryManager *Manager + roundTripper http.RoundTripper + getRequestLogger requestLoggerGetter + retryManager *Manager + getActiveSubgraph func(req *http.Request) string } // parseRetryAfterHeader parses the Retry-After header value according to RFC 7231. @@ -74,15 +73,12 @@ func shouldUseRetryAfter(logger *zap.Logger, resp *http.Response, maxDuration ti return duration, duration > 0 } -func NewRetryHTTPTransport( - roundTripper http.RoundTripper, - getRequestLogger requestLoggerGetter, - retryManager *Manager, -) *RetryHTTPTransport { +func NewRetryHTTPTransport(roundTripper http.RoundTripper, getRequestLogger requestLoggerGetter, retryManager *Manager, getActiveSubgraph func(req *http.Request) string) *RetryHTTPTransport { return &RetryHTTPTransport{ - roundTripper: roundTripper, - getRequestLogger: getRequestLogger, - retryManager: retryManager, + roundTripper: roundTripper, + getRequestLogger: getRequestLogger, + getActiveSubgraph: getActiveSubgraph, + retryManager: retryManager, } } @@ -93,16 +89,10 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro return resp, nil } - var subgraph string - subgraphCtxVal := req.Context().Value(rcontext.CurrentSubgraphContextKey{}) - if subgraphCtxVal != nil { - if sg, ok := subgraphCtxVal.(string); ok { - subgraph = sg - } - } + activeSubgraph := rt.getActiveSubgraph(req) + retryOptions := rt.retryManager.GetSubgraphOptions(activeSubgraph) // If there is no option defined for this subgraph - retryOptions := rt.retryManager.GetSubgraphOptions(subgraph) if retryOptions == nil { return resp, err } @@ -115,6 +105,7 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // Retry logic retries := 0 for (rt.retryManager.Retry(err, req, resp, retryOptions.Expression)) && retries < retryOptions.MaxRetryCount { + retries++ // Check if we should use 
Retry-After header for 429 responses diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index 5aa9163d49..e58a1228f2 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -2,7 +2,6 @@ package retrytransport import ( "bytes" - "context" "errors" "fmt" "io" @@ -17,12 +16,19 @@ import ( "github.com/stretchr/testify/assert" "go.uber.org/zap" - rcontext "github.com/wundergraph/cosmo/router/internal/context" "github.com/wundergraph/cosmo/router/internal/expr" ) const defaultMaxDuration = 100 * time.Second +var loggerFunc = func(req *http.Request) *zap.Logger { return zap.NewNop() } + +var getActiveSubgraph = func(subgraphName string) func(req *http.Request) string { + return func(req *http.Request) string { + return subgraphName + } +} + // simpleShouldRetry provides simple retry logic for testing the transport implementation func simpleShouldRetry(err error, req *http.Request, resp *http.Response) bool { // Simple logic for testing - retry on 5xx status codes or any error @@ -55,11 +61,6 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { return dt.handler(req) } -func withSubgraph(req *http.Request, name string) *http.Request { - ctx := context.WithValue(req.Context(), rcontext.CurrentSubgraphContextKey{}, name) - return req.WithContext(ctx) -} - func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, resp *http.Response, exprString string) bool { return shouldRetry(err, req, resp) @@ -74,31 +75,28 @@ func TestRetryOnHTTP5xx(t *testing.T) { attemptCount := 0 maxRetries := 3 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration 
time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 500 to trigger retry - return &http.Response{ - StatusCode: http.StatusInternalServerError, - }, nil - } - // Finally return success + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 500 to trigger retry return &http.Response{ - StatusCode: http.StatusOK, + StatusCode: http.StatusInternalServerError, }, nil - }, + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -114,29 +112,26 @@ func TestRetryOnErrors(t *testing.T) { attemptCount := 0 maxRetries := 3 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return any error to trigger retry - return nil, errors.New("some network error") - } - // Finally 
return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return any error to trigger retry + return nil, errors.New("some network error") + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -162,32 +157,29 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { return simpleShouldRetry(err, req, resp) } + subgraphName := "sg" mgr := newTestManager(shouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - switch attemptCount { - case 1: - // First attempt: return retryable error - return nil, errors.New("retryable error") - case 2: - // Second attempt: return retryable status code - return &http.Response{StatusCode: http.StatusInternalServerError}, nil - default: - // Third attempt: return non-retryable error (should stop retrying) - return nil, nonRetryableError - } - }, + }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := 
NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + switch attemptCount { + case 1: + // First attempt: return retryable error + return nil, errors.New("retryable error") + case 2: + // Second attempt: return retryable status code + return &http.Response{StatusCode: http.StatusInternalServerError}, nil + default: + // Third attempt: return non-retryable error (should stop retrying) + return nil, nonRetryableError + } }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -232,26 +224,23 @@ func (b *TrackableBody) Close() error { func TestShortCircuitOnSuccess(t *testing.T) { attemptCount := 0 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { t.Error("OnRetry should not be called when first request succeeds") - }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - // Always return success - return &http.Response{ - StatusCode: http.StatusOK, - Body: io.NopCloser(strings.NewReader("success")), - }, nil - }, + }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + // Always return success + return &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader("success")), + }, nil }, - func(req *http.Request) 
*zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -270,23 +259,20 @@ func TestMaxRetryCountRespected(t *testing.T) { retries := 0 attemptCount := 0 + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - // Always return retryable error to test max retry limit - return nil, errors.New("always fail") - }, + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + // Always return retryable error to test max retry limit + return nil, errors.New("always fail") }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Error(t, err) @@ -311,32 +297,29 @@ func TestResponseBodyDraining(t *testing.T) { } } + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { actualRetries++ - }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 
* time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - index++ - if index < retryCount { - return &http.Response{ - StatusCode: http.StatusInternalServerError, - Body: bodies[index], - }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil - } - }, + }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + index++ + if index < retryCount { + return &http.Response{ + StatusCode: http.StatusInternalServerError, + Body: bodies[index], + }, nil + } else { + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil + } }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.Nil(t, err) @@ -380,34 +363,32 @@ func TestRequestLoggerIsUsed(t *testing.T) { bodies[i] = trackableBody } + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { actualRetries++ - }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - index++ - if index < retryCount { - return &http.Response{ - StatusCode: http.StatusInternalServerError, - Body: bodies[index], - }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil - } - }, + }, RetryOptions{MaxRetryCount: 
retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + index++ + if index < retryCount { + return &http.Response{ + StatusCode: http.StatusInternalServerError, + Body: bodies[index], + }, nil + } else { + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil + } }, - func(req *http.Request) *zap.Logger { return requestLogger }, - mgr, - ) + }, func(req *http.Request) *zap.Logger { return requestLogger }, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) - tr.RoundTrip(req) + _, err := tr.RoundTrip(req) + assert.NoError(t, err) assert.Contains(t, requestLoggerBuf.String(), "Failed draining when discarding the body\t{\"error\": \"retry read error, index: 1\"}") assert.Contains(t, requestLoggerBuf.String(), "Failed draining when closing the body\t{\"error\": \"retry close error, index: 2\"}") @@ -435,6 +416,7 @@ func TestOnRetryCallbackInvoked(t *testing.T) { resp *http.Response } + subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ retryCallbacks = append(retryCallbacks, struct { @@ -442,26 +424,22 @@ func TestOnRetryCallbackInvoked(t *testing.T) { err error resp *http.Response }{count: count, err: err, resp: resp}) - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - if retries < maxRetries { - // Return retryable error - return nil, errors.New("retryable error") - } - // Finally return success - return &http.Response{ - StatusCode: 
http.StatusOK, - }, nil - }, + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + if retries < maxRetries { + // Return retryable error + return nil, errors.New("retryable error") + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -489,41 +467,38 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool + subgraphName := `sg` mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with Retry-After header in seconds - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) - - // Verify the header is parsed correctly - duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsed = append(retryAfterUsed, useRetryAfter) - assert.True(t, useRetryAfter, "Should use Retry-After header for 429") - assert.Equal(t, 
time.Duration(retryAfterSeconds)*time.Second, duration) - - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with Retry-After header in seconds + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + + // Verify the header is parsed correctly + duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsed = append(retryAfterUsed, useRetryAfter) + assert.True(t, useRetryAfter, "Should use Retry-After header for 429") + assert.Equal(t, time.Duration(retryAfterSeconds)*time.Second, duration) + + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -547,41 +522,38 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { // Track what retry duration was requested to verify Retry-After is parsed correctly var retryAfterUsed []bool + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := 
NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with Retry-After header in seconds - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) - - // Verify the header is parsed correctly - duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, maxDuration) - retryAfterUsed = append(retryAfterUsed, useRetryAfter) - assert.True(t, useRetryAfter, "Should use Retry-After header for 429") - assert.Equal(t, maxDuration, duration) - - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with Retry-After header in seconds + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + + // Verify the header is parsed correctly + duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, maxDuration) + retryAfterUsed = append(retryAfterUsed, useRetryAfter) + assert.True(t, useRetryAfter, "Should use Retry-After header for 429") + assert.Equal(t, maxDuration, duration) + + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", 
"http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -600,32 +572,29 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { attemptCount := 0 maxRetries := 2 + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 without Retry-After header - return &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - }, nil - } - // Finally return success + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 without Retry-After header return &http.Response{ - StatusCode: http.StatusOK, + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), }, nil - }, + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -644,45 +613,42 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { var retryAfterUsed []bool var expectedDuration time.Duration + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp 
*http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with Retry-After header as HTTP-date (1 second in future to keep test fast) - expectedDuration = 1 * time.Second - futureTime := time.Now().UTC().Add(expectedDuration) - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", futureTime.Format(http.TimeFormat)) - - // Verify the header is parsed correctly - duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsed = append(retryAfterUsed, useRetryAfter) - assert.True(t, useRetryAfter, "Should use Retry-After header for 429") - // Allow reasonable tolerance for execution delay between time creation and parsing - assert.True(t, duration > 0 && duration <= expectedDuration, - "Duration should be positive and <= %v, got %v", expectedDuration, duration) - - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with Retry-After header as HTTP-date (1 second in future to keep test fast) + expectedDuration = 1 * time.Second + futureTime := time.Now().UTC().Add(expectedDuration) + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", futureTime.Format(http.TimeFormat)) + + // Verify the header is parsed correctly + 
duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsed = append(retryAfterUsed, useRetryAfter) + assert.True(t, useRetryAfter, "Should use Retry-After header for 429") + // Allow reasonable tolerance for execution delay between time creation and parsing + assert.True(t, duration > 0 && duration <= expectedDuration, + "Duration should be positive and <= %v, got %v", expectedDuration, duration) + + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -701,34 +667,31 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { attemptCount := 0 maxRetries := 2 + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with invalid Retry-After header - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "invalid-value") - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + 
// Return 429 with invalid Retry-After header + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", "invalid-value") + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -744,34 +707,31 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { attemptCount := 0 maxRetries := 2 + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - if attemptCount <= maxRetries { - // Return 429 with negative Retry-After value (should fall back to normal backoff) - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "-1") - return resp, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + if attemptCount <= maxRetries { + // Return 429 with negative Retry-After value (should fall back to normal backoff) + resp := &http.Response{ + StatusCode: 
http.StatusTooManyRequests, + Header: make(http.Header), } - // Finally return success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil - }, + resp.Header.Set("Retry-After", "-1") + return resp, nil + } + // Finally return success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -789,62 +749,59 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { // Track which responses used Retry-After vs normal backoff var retryAfterUsedPerAttempt []bool + subgraphName := "sg" mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, "sg") - - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - switch attemptCount { - case 1: - // First: 429 with Retry-After - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "1") - - // Verify this should use Retry-After - _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) - - return resp, nil - case 2: - // Second: Network error (should use normal backoff) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, false) - return nil, errors.New("network error") - case 3: - // Third: 500 error (should use normal backoff) - resp := &http.Response{ - StatusCode: http.StatusInternalServerError, - } - _, 
useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) - return resp, nil - case 4: - // Fourth: 429 without Retry-After (should use normal backoff) - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) - retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) - return resp, nil - default: - // Finally: Success - return &http.Response{ - StatusCode: http.StatusOK, - }, nil + }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) + + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + switch attemptCount { + case 1: + // First: 429 with Retry-After + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), } - }, + resp.Header.Set("Retry-After", "1") + + // Verify this should use Retry-After + _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) + + return resp, nil + case 2: + // Second: Network error (should use normal backoff) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, false) + return nil, errors.New("network error") + case 3: + // Third: 500 error (should use normal backoff) + resp := &http.Response{ + StatusCode: http.StatusInternalServerError, + } + _, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) + return resp, nil + case 4: + // Fourth: 429 without Retry-After (should use normal backoff) + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), + } + _, useRetryAfter := 
shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) + retryAfterUsedPerAttempt = append(retryAfterUsedPerAttempt, useRetryAfter) + return resp, nil + default: + // Finally: Success + return &http.Response{ + StatusCode: http.StatusOK, + }, nil + } }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) @@ -873,28 +830,25 @@ func TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { return err != nil } + subgraphName := `sg` mgr := newTestManager(shouldNotRetry429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { retries++ - }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, "sg") + }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) - tr := NewRetryHTTPTransport( - &MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { - attemptCount++ - // Always return 429 with Retry-After header - resp := &http.Response{ - StatusCode: http.StatusTooManyRequests, - Header: make(http.Header), - } - resp.Header.Set("Retry-After", "1") - return resp, nil - }, + tr := NewRetryHTTPTransport(&MockTransport{ + handler: func(req *http.Request) (*http.Response, error) { + attemptCount++ + // Always return 429 with Retry-After header + resp := &http.Response{ + StatusCode: http.StatusTooManyRequests, + Header: make(http.Header), + } + resp.Header.Set("Retry-After", "1") + return resp, nil }, - func(req *http.Request) *zap.Logger { return zap.NewNop() }, - mgr, - ) + }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) - req := withSubgraph(httptest.NewRequest("GET", "http://localhost:3000/graphql", 
nil), "sg") + req := httptest.NewRequest("GET", "http://localhost:3000/graphql", nil) resp, err := tr.RoundTrip(req) assert.NoError(t, err) diff --git a/router/internal/traceclient/traceclient.go b/router/internal/traceclient/traceclient.go index d787473330..3207e3697f 100644 --- a/router/internal/traceclient/traceclient.go +++ b/router/internal/traceclient/traceclient.go @@ -6,7 +6,6 @@ import ( "net/http/httptrace" "time" - rcontext "github.com/wundergraph/cosmo/router/internal/context" "github.com/wundergraph/cosmo/router/internal/expr" "github.com/wundergraph/cosmo/router/pkg/metric" @@ -33,20 +32,23 @@ type ClientTrace struct { type ClientTraceContextKey struct{} type TraceInjectingRoundTripper struct { - base http.RoundTripper - connectionMetricStore metric.ConnectionMetricStore - reqContextValuesGetter func(ctx context.Context, req *http.Request) (*expr.Context, string) + base http.RoundTripper + connectionMetricStore metric.ConnectionMetricStore + getExprContext func(ctx context.Context, req *http.Request) *expr.Context + getActiveSubgraphName func(req *http.Request) string } func NewTraceInjectingRoundTripper( base http.RoundTripper, connectionMetricStore metric.ConnectionMetricStore, - reqContextValuesGetter func(ctx context.Context, req *http.Request) (*expr.Context, string), + getExprContext func(ctx context.Context, req *http.Request) *expr.Context, + getActiveSubgraphName func(req *http.Request) string, ) *TraceInjectingRoundTripper { return &TraceInjectingRoundTripper{ - base: base, - connectionMetricStore: connectionMetricStore, - reqContextValuesGetter: reqContextValuesGetter, + base: base, + connectionMetricStore: connectionMetricStore, + getExprContext: getExprContext, + getActiveSubgraphName: getActiveSubgraphName, } } @@ -104,23 +106,14 @@ func (t *TraceInjectingRoundTripper) getClientTrace(ctx context.Context) *httptr func (t *TraceInjectingRoundTripper) processConnectionMetrics(ctx context.Context, req *http.Request) { trace := 
GetClientTraceFromContext(ctx) - var subgraph string - subgraphCtxVal := ctx.Value(rcontext.CurrentSubgraphContextKey{}) - if subgraphCtxVal != nil { - subgraph = subgraphCtxVal.(string) - } - - // We have a fallback for active subgraph name in case engine loader hooks is not called - // TODO: Evaluate if we actually need a fallback and if we can use only one way to get the active subgraph name - exprContext, activeSubgraphName := t.reqContextValuesGetter(ctx, req) - if subgraph == "" { - subgraph = activeSubgraphName - } + exprContext := t.getExprContext(ctx, req) if trace.ConnectionGet != nil && trace.ConnectionAcquired != nil { duration := trace.ConnectionAcquired.Time.Sub(trace.ConnectionGet.Time) exprContext.Subgraph.Request.ClientTrace.ConnectionAcquireDuration = duration + subgraph := t.getActiveSubgraphName(req) + serverAttributes := rotel.GetServerAttributes(trace.ConnectionGet.HostPort) serverAttributes = append( serverAttributes, From bceee7efe357a6ffd07c87a2c20435a224cab526 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 01:36:01 +0530 Subject: [PATCH 14/24] fix: feature flags --- router-tests/retry_test.go | 49 +++++++++++++++++++ router/core/graph_server.go | 2 +- router/internal/retrytransport/manager.go | 33 ++++++++----- .../retrytransport/retry_transport.go | 1 - 4 files changed, 71 insertions(+), 14 deletions(-) diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index d52c3ef1ab..ae0186ab65 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -928,6 +928,55 @@ func TestRetryPerSubgraph(t *testing.T) { }) }) + t.Run("verify retries are applied when only all and no subgraph specific overrides are present", func(t *testing.T) { + t.Parallel() + + calls := atomic.Int32{} + + // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry + generalMax := 5 + opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ + All: config.GlobalSubgraphRequestRule{ + 
BackoffJitterRetry: config.BackoffJitterRetry{ + Enabled: true, + MaxAttempts: generalMax, + MaxDuration: 2 * time.Second, + Interval: 10 * time.Millisecond, + Expression: "true", + }, + }, + }) + options := core.WithSubgraphRetryOptions(opts) + + testenv.Run(t, &testenv.Config{ + NoRetryClient: true, + RouterOptions: []core.Option{ + options, + }, + Subgraphs: testenv.SubgraphsConfig{ + ProductsFg: testenv.SubgraphConfig{ + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusBadGateway) + calls.Add(1) + }) + }, + }, + }, + }, func(t *testing.T, xEnv *testenv.Environment) { + res, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ + Query: `query { employees { id products } }`, + Header: map[string][]string{ + "X-Feature-Flag": {"myff"}, + }, + }) + require.NoError(t, err) + require.Equal(t, res.Response.Header.Get("X-Feature-Flag"), "myff") + require.Contains(t, res.Body, `{"message":"Failed to fetch from Subgraph 'products_fg' at Path 'employees', Reason: empty response.","extensions":{"statusCode":502}}`) + require.Equal(t, int32(generalMax+1), calls.Load()) + }) + }) + t.Run("verify retry on upgrade requests", func(t *testing.T) { t.Parallel() diff --git a/router/core/graph_server.go b/router/core/graph_server.go index 99fdc21c97..44af5e43a3 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -276,7 +276,7 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC err = retryManager.Initialize( s.retryOptions.All, s.retryOptions.SubgraphMap, - routerConfig.GetSubgraphs(), + routerConfig, ) if err != nil { return nil, err diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 1c7fdff1ad..9ca0555c3f 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -47,21 +47,30 @@ func NewManager(exprManager 
*expr.RetryExpressionManager, retryFunc ShouldRetryF } } -func (m *Manager) Initialize( - baseRetryOptions RetryOptions, - subgraphRetryOptions map[string]RetryOptions, - subgraphs []*nodev1.Subgraph, -) error { - defaultSgNames := make([]string, 0, len(subgraphs)) - customSgNames := make([]string, 0, len(subgraphs)) - - for _, subgraph := range subgraphs { - entry, ok := subgraphRetryOptions[subgraph.Name] +func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions map[string]RetryOptions, routerConfig *nodev1.RouterConfig) error { + // Get the list of all subgraph AND feature subgraphs + subgraphNameSet := make(map[string]bool, len(routerConfig.Subgraphs)) + for _, subgraph := range routerConfig.GetSubgraphs() { + subgraphNameSet[subgraph.Name] = true + } + if routerConfig.FeatureFlagConfigs != nil { + for _, ffConfig := range routerConfig.FeatureFlagConfigs.ConfigByFeatureFlagName { + for _, subgraph := range ffConfig.GetSubgraphs() { + subgraphNameSet[subgraph.Name] = true + } + } + } + + defaultSgNames := make([]string, 0, len(subgraphNameSet)) + customSgNames := make([]string, 0, len(subgraphNameSet)) + + for subgraphName := range subgraphNameSet { + entry, ok := subgraphRetryOptions[subgraphName] if !ok { - defaultSgNames = append(defaultSgNames, subgraph.Name) + defaultSgNames = append(defaultSgNames, subgraphName) } else if entry.Enabled { // This will cover the case of if a subgraph is explicitly disabled - customSgNames = append(customSgNames, subgraph.Name) + customSgNames = append(customSgNames, subgraphName) } } diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 26fcee92e3..505eca2e28 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -105,7 +105,6 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro // Retry logic retries := 0 for (rt.retryManager.Retry(err, 
req, resp, retryOptions.Expression)) && retries < retryOptions.MaxRetryCount { - retries++ // Check if we should use Retry-After header for 429 responses From 3131bb710298cb40ab56dff597c61c6f4e4f16e8 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 01:54:00 +0530 Subject: [PATCH 15/24] fix: linting --- router-tests/circuit_breaker_test.go | 6 +-- router-tests/retry_test.go | 70 ++++++++++++++-------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index 2ee4ef21c0..b6d83f59b1 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -658,7 +658,7 @@ func TestCircuitBreaker(t *testing.T) { }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { + Middleware: func(_ http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { employeesCalls.Add(1) if employeesCalls.Load() <= failedTries { @@ -667,7 +667,7 @@ func TestCircuitBreaker(t *testing.T) { } upgrader := websocket.Upgrader{ - CheckOrigin: func(r *http.Request) bool { + CheckOrigin: func(_ *http.Request) bool { return true }, Subprotocols: []string{"graphql-transport-ws"}, @@ -706,7 +706,7 @@ func TestCircuitBreaker(t *testing.T) { _, message, err := testenv.WSReadMessage(t, conn) require.NoError(t, err) - require.Equal(t, defaultErrorMessage, string(message)) + require.JSONEq(t, defaultErrorMessage, string(message)) require.NoError(t, conn.Close()) switch { diff --git a/router-tests/retry_test.go b/router-tests/retry_test.go index ae0186ab65..a679a643db 100644 --- a/router-tests/retry_test.go +++ b/router-tests/retry_test.go @@ -524,7 +524,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, 
"expected initialization to fail due to invalid algorithm") }) @@ -553,7 +553,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) {}) + }, func(_ *testing.T, _ *testenv.Environment) {}) require.NoError(t, err) }) @@ -582,7 +582,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) {}) + }, func(_ *testing.T, _ *testenv.Environment) {}) require.NoError(t, err) }) @@ -611,7 +611,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, "expected initialization to fail due to invalid algorithm") }) @@ -639,7 +639,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, "expected initialization to fail due to invalid algorithm") }) @@ -669,7 +669,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(t *testing.T, _ *testenv.Environment) { require.Fail(t, "expected initialization to fail due to invalid algorithm") }) @@ -700,7 +700,7 @@ func TestRetryPerSubgraph(t *testing.T) { RouterOptions: []core.Option{ options, }, - }, func(t *testing.T, xEnv *testenv.Environment) { + }, func(_ *testing.T, _ *testenv.Environment) { }) require.NoError(t, err) @@ -754,8 +754,8 @@ func TestRetryPerSubgraph(t *testing.T) { }, }, Test1: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { 
w.WriteHeader(http.StatusBadGateway) test1Calls.Add(1) }) @@ -768,9 +768,9 @@ func TestRetryPerSubgraph(t *testing.T) { Query: `query employees { employees { id } }`, }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(0), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, 0, int(test1Calls.Load())) // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -778,9 +778,9 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(test1Max+1), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, test1Max+1, int(test1Calls.Load())) }) }) @@ -832,8 +832,8 @@ func TestRetryPerSubgraph(t *testing.T) { }, }, Test1: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, 
_ *http.Request) { w.WriteHeader(http.StatusBadGateway) test1Calls.Add(1) }) @@ -846,9 +846,9 @@ func TestRetryPerSubgraph(t *testing.T) { Query: `query employees { employees { id } }`, }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(0), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, 0, int(test1Calls.Load())) // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -856,9 +856,9 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) - require.Equal(t, int32(employeesMax+1), employeesCalls.Load()) - require.Equal(t, int32(test1Max+1), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, employeesMax+1, int(employeesCalls.Load())) + require.Equal(t, test1Max+1, int(test1Calls.Load())) }) }) @@ -898,8 +898,8 @@ func TestRetryPerSubgraph(t *testing.T) { }, }, Test1: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { - return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + Middleware: func(_ http.Handler) http.Handler { + return http.HandlerFunc(func(w 
http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusBadGateway) test1Calls.Add(1) }) @@ -912,9 +912,9 @@ func TestRetryPerSubgraph(t *testing.T) { Query: `query employees { employees { id } }`, }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) - require.Equal(t, int32(generalMax+1), employeesCalls.Load()) - require.Equal(t, int32(0), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'employees', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"employees":null}}`, resEmp.Body) + require.Equal(t, generalMax+1, int(employeesCalls.Load())) + require.Equal(t, 0, int(test1Calls.Load())) // 2) Call test1-only query; expect test subgraph to be retried test1Max times (attempts = retries + 1) resTest1, err := xEnv.MakeGraphQLRequest(testenv.GraphQLRequest{ @@ -922,9 +922,9 @@ func TestRetryPerSubgraph(t *testing.T) { }) require.NoError(t, err) - require.Equal(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) - require.Equal(t, int32(generalMax+1), employeesCalls.Load()) - require.Equal(t, int32(generalMax+1), test1Calls.Load()) + require.JSONEq(t, `{"errors":[{"message":"Failed to fetch from Subgraph 'test1', Reason: empty response.","extensions":{"statusCode":502}}],"data":{"floatField":null}}`, resTest1.Body) + require.Equal(t, generalMax+1, int(employeesCalls.Load())) + require.Equal(t, generalMax+1, int(test1Calls.Load())) }) }) @@ -971,9 +971,9 @@ func TestRetryPerSubgraph(t *testing.T) { }, }) require.NoError(t, err) - require.Equal(t, res.Response.Header.Get("X-Feature-Flag"), "myff") + require.Equal(t, "myff", res.Response.Header.Get("X-Feature-Flag")) require.Contains(t, res.Body, `{"message":"Failed to fetch from 
Subgraph 'products_fg' at Path 'employees', Reason: empty response.","extensions":{"statusCode":502}}`) - require.Equal(t, int32(generalMax+1), calls.Load()) + require.Equal(t, generalMax+1, int(calls.Load())) }) }) @@ -1007,7 +1007,7 @@ func TestRetryPerSubgraph(t *testing.T) { }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ - Middleware: func(handler http.Handler) http.Handler { + Middleware: func(_ http.Handler) http.Handler { return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { employeesCalls.Add(1) if employeesCalls.Load() <= failedTries { @@ -1016,7 +1016,7 @@ func TestRetryPerSubgraph(t *testing.T) { } upgrader := websocket.Upgrader{ - CheckOrigin: func(r *http.Request) bool { + CheckOrigin: func(_ *http.Request) bool { return true }, Subprotocols: []string{"graphql-transport-ws"}, From f3531690a8925716e825fb0d7a93ededadd14df9 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 02:08:02 +0530 Subject: [PATCH 16/24] fix: linting --- router/core/transport.go | 2 +- router/internal/expr/retry_expression.go | 6 +- router/internal/retrytransport/manager.go | 22 +++--- .../retrytransport/retry_transport_test.go | 76 +++++++++---------- router/internal/traceclient/traceclient.go | 8 +- 5 files changed, 57 insertions(+), 57 deletions(-) diff --git a/router/core/transport.go b/router/core/transport.go index 2211936ba6..fd3bc7f251 100644 --- a/router/core/transport.go +++ b/router/core/transport.go @@ -103,7 +103,7 @@ func NewCustomTransport( } if enableTraceClient { - getExprContext := func(ctx context.Context, req *http.Request) *expr.Context { + getExprContext := func(ctx context.Context) *expr.Context { reqContext := getRequestContext(ctx) if reqContext == nil { return &expr.Context{} diff --git a/router/internal/expr/retry_expression.go b/router/internal/expr/retry_expression.go index 0d49664dd6..c925bb87dd 100644 --- a/router/internal/expr/retry_expression.go +++ 
b/router/internal/expr/retry_expression.go @@ -22,14 +22,14 @@ func NewRetryExpressionManager() *RetryExpressionManager { } } -func (c *RetryExpressionManager) AddExpression(exprString string) error { +func (m *RetryExpressionManager) AddExpression(exprString string) error { expression := exprString if expression == "" { expression = defaultRetryExpression } // The expression has already been processed, skip recompilation - if _, ok := c.expressionMap[expression]; ok { + if _, ok := m.expressionMap[expression]; ok { return nil } @@ -45,7 +45,7 @@ func (c *RetryExpressionManager) AddExpression(exprString string) error { } // Use the normalized expression string as the key for deduplication - c.expressionMap[expression] = program + m.expressionMap[expression] = program return nil } diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 9ca0555c3f..0b140a4cca 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -78,18 +78,18 @@ func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions if len(defaultSgNames) > 0 && baseRetryOptions.Enabled { if baseRetryOptions.Algorithm != BackoffJitter { return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) - } else { - err := m.exprManager.AddExpression(baseRetryOptions.Expression) - if err != nil { - return fmt.Errorf("failed to add base retry expression: %w", err) - } else { - // Only assign default options if validation succeeds - for _, sgName := range defaultSgNames { - opts := baseRetryOptions - m.retries[sgName] = &opts - } - } } + + err := m.exprManager.AddExpression(baseRetryOptions.Expression) + if err != nil { + return fmt.Errorf("failed to add base retry expression: %w", err) + } + // Only assign default options if validation succeeds + for _, sgName := range defaultSgNames { + opts := baseRetryOptions + m.retries[sgName] = &opts + } + } // Process custom retry options 
diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index e58a1228f2..54ca3b7b02 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -7,6 +7,7 @@ import ( "io" "net/http" "net/http/httptest" + "strconv" "strings" "testing" "time" @@ -21,10 +22,10 @@ import ( const defaultMaxDuration = 100 * time.Second -var loggerFunc = func(req *http.Request) *zap.Logger { return zap.NewNop() } +var loggerFunc = func(_ *http.Request) *zap.Logger { return zap.NewNop() } -var getActiveSubgraph = func(subgraphName string) func(req *http.Request) string { - return func(req *http.Request) string { +var getActiveSubgraph = func(subgraphName string) func(_ *http.Request) string { + return func(_ *http.Request) string { return subgraphName } } @@ -62,7 +63,7 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { } func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { - mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, resp *http.Response, exprString string) bool { + mgr := NewManager(expr.NewRetryExpressionManager(), func(err error, req *http.Request, resp *http.Response, _ string) bool { return shouldRetry(err, req, resp) }, onRetry) // attach options for the subgraph @@ -76,12 +77,12 @@ func TestRetryOnHTTP5xx(t *testing.T) { maxRetries := 3 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := 
NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { // Return 500 to trigger retry @@ -113,12 +114,12 @@ func TestRetryOnErrors(t *testing.T) { maxRetries := 3 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ if attemptCount <= maxRetries { // Return any error to trigger retry @@ -158,12 +159,12 @@ func TestDoNotRetryWhenShouldRetryReturnsFalse(t *testing.T) { } subgraphName := "sg" - mgr := newTestManager(shouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ switch attemptCount { case 1: @@ -225,12 +226,12 @@ func TestShortCircuitOnSuccess(t *testing.T) { attemptCount := 0 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ 
*http.Request, _ *http.Response, _ time.Duration, _ error) { t.Error("OnRetry should not be called when first request succeeds") }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ // Always return success return &http.Response{ @@ -260,12 +261,12 @@ func TestMaxRetryCountRespected(t *testing.T) { attemptCount := 0 subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { attemptCount++ // Always return retryable error to test max retry limit return nil, errors.New("always fail") @@ -298,24 +299,24 @@ func TestResponseBodyDraining(t *testing.T) { } subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { actualRetries++ }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { index++ if index < retryCount { return &http.Response{ StatusCode: 
http.StatusInternalServerError, Body: bodies[index], }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil } + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil + }, }, loggerFunc, mgr, getActiveSubgraph(subgraphName)) @@ -364,24 +365,23 @@ func TestRequestLoggerIsUsed(t *testing.T) { } subgraphName := "sg" - mgr := newTestManager(simpleShouldRetry, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { actualRetries++ }, RetryOptions{MaxRetryCount: retryCount, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ - handler: func(req *http.Request) (*http.Response, error) { + handler: func(_ *http.Request) (*http.Response, error) { index++ if index < retryCount { return &http.Response{ StatusCode: http.StatusInternalServerError, Body: bodies[index], }, nil - } else { - return &http.Response{ - StatusCode: http.StatusOK, - Body: bodies[index], - }, nil } + return &http.Response{ + StatusCode: http.StatusOK, + Body: bodies[index], + }, nil }, }, func(req *http.Request) *zap.Logger { return requestLogger }, mgr, getActiveSubgraph(subgraphName)) @@ -468,7 +468,7 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { var retryAfterUsed []bool subgraphName := `sg` - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -481,7 +481,7 @@ func TestRetryOn429WithDelaySeconds(t *testing.T) { StatusCode: 
http.StatusTooManyRequests, Header: make(http.Header), } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + resp.Header.Set("Retry-After", strconv.Itoa(retryAfterSeconds)) // Verify the header is parsed correctly duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, defaultMaxDuration) @@ -523,7 +523,7 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { var retryAfterUsed []bool subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -536,7 +536,7 @@ func TestRetryOn429WithDelaySecondsLargerThanMaxDuration(t *testing.T) { StatusCode: http.StatusTooManyRequests, Header: make(http.Header), } - resp.Header.Set("Retry-After", fmt.Sprintf("%d", retryAfterSeconds)) + resp.Header.Set("Retry-After", strconv.Itoa(retryAfterSeconds)) // Verify the header is parsed correctly duration, useRetryAfter := shouldUseRetryAfter(zap.NewNop(), resp, maxDuration) @@ -573,7 +573,7 @@ func TestRetryOn429WithoutRetryAfter(t *testing.T) { maxRetries := 2 subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) @@ -614,7 +614,7 @@ func TestRetryOn429WithHTTPDate(t *testing.T) { var expectedDuration time.Duration subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp 
*http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 100 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -668,7 +668,7 @@ func TestRetryOn429WithInvalidRetryAfterHeader(t *testing.T) { maxRetries := 2 subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) @@ -708,7 +708,7 @@ func TestRetryOn429WithNegativeDelaySeconds(t *testing.T) { maxRetries := 2 subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) @@ -750,7 +750,7 @@ func TestRetryMixed429AndOtherErrors(t *testing.T) { var retryAfterUsedPerAttempt []bool subgraphName := "sg" - mgr := newTestManager(shouldRetryWith429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldRetryWith429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: maxRetries, Interval: 10 * time.Millisecond, MaxDuration: 10 * time.Second}, subgraphName) @@ -831,7 +831,7 @@ func TestNoRetryOn429WhenShouldRetryReturnsFalse(t *testing.T) { } subgraphName := `sg` - mgr := 
newTestManager(shouldNotRetry429, func(count int, req *http.Request, resp *http.Response, sleepDuration time.Duration, err error) { + mgr := newTestManager(shouldNotRetry429, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { retries++ }, RetryOptions{MaxRetryCount: 3, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) diff --git a/router/internal/traceclient/traceclient.go b/router/internal/traceclient/traceclient.go index 3207e3697f..53cd7236a8 100644 --- a/router/internal/traceclient/traceclient.go +++ b/router/internal/traceclient/traceclient.go @@ -34,14 +34,14 @@ type ClientTraceContextKey struct{} type TraceInjectingRoundTripper struct { base http.RoundTripper connectionMetricStore metric.ConnectionMetricStore - getExprContext func(ctx context.Context, req *http.Request) *expr.Context + getExprContext func(ctx context.Context) *expr.Context getActiveSubgraphName func(req *http.Request) string } func NewTraceInjectingRoundTripper( base http.RoundTripper, connectionMetricStore metric.ConnectionMetricStore, - getExprContext func(ctx context.Context, req *http.Request) *expr.Context, + getExprContext func(ctx context.Context) *expr.Context, getActiveSubgraphName func(req *http.Request) string, ) *TraceInjectingRoundTripper { return &TraceInjectingRoundTripper{ @@ -106,14 +106,14 @@ func (t *TraceInjectingRoundTripper) getClientTrace(ctx context.Context) *httptr func (t *TraceInjectingRoundTripper) processConnectionMetrics(ctx context.Context, req *http.Request) { trace := GetClientTraceFromContext(ctx) - exprContext := t.getExprContext(ctx, req) + exprContext := t.getExprContext(ctx) if trace.ConnectionGet != nil && trace.ConnectionAcquired != nil { duration := trace.ConnectionAcquired.Time.Sub(trace.ConnectionGet.Time) exprContext.Subgraph.Request.ClientTrace.ConnectionAcquireDuration = duration subgraph := t.getActiveSubgraphName(req) - + serverAttributes := 
rotel.GetServerAttributes(trace.ConnectionGet.HostPort) serverAttributes = append( serverAttributes, From 72c826870ee450233cb166e82255e3ddf0672c8d Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 02:56:20 +0530 Subject: [PATCH 17/24] fix: ci tests --- router-tests/circuit_breaker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index b6d83f59b1..b7b0a8f4d1 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -724,7 +724,7 @@ func TestCircuitBreaker(t *testing.T) { require.Equal(t, failedTries, employeesCalls.Load()) // Wait for current bucket to be cleaned up - time.Sleep(breaker.RollingDuration + time.Millisecond*500) + time.Sleep(breaker.RollingDuration + time.Millisecond*2000) // ==== // Verify a success case with messages validated from here onwards From db1964c1fdaea1b24566aafe27aaa7feb5bf9328 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Thu, 18 Sep 2025 03:10:19 +0530 Subject: [PATCH 18/24] fix: increase duration --- router-tests/circuit_breaker_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/router-tests/circuit_breaker_test.go b/router-tests/circuit_breaker_test.go index b7b0a8f4d1..ee5e4a4e73 100644 --- a/router-tests/circuit_breaker_test.go +++ b/router-tests/circuit_breaker_test.go @@ -724,7 +724,7 @@ func TestCircuitBreaker(t *testing.T) { require.Equal(t, failedTries, employeesCalls.Load()) // Wait for current bucket to be cleaned up - time.Sleep(breaker.RollingDuration + time.Millisecond*2000) + time.Sleep(breaker.RollingDuration*3 + time.Millisecond*1000) // ==== // Verify a success case with messages validated from here onwards From f37ac89baecf76e78fbc2f83a5f6ea34e9a8264c Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 18:21:13 +0530 Subject: [PATCH 19/24] fix: gofmt --- router/internal/retrytransport/manager.go | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 0b140a4cca..576358937d 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -79,7 +79,7 @@ func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions if baseRetryOptions.Algorithm != BackoffJitter { return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) } - + err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { return fmt.Errorf("failed to add base retry expression: %w", err) From a4d460baa8f8e55f4af7a7a31269ca69097ed7fb Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 19:19:10 +0530 Subject: [PATCH 20/24] fix: circuit breaker tests --- router-tests/security/circuit_breaker_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/router-tests/security/circuit_breaker_test.go b/router-tests/security/circuit_breaker_test.go index e521b6bd65..efbc59a7a2 100644 --- a/router-tests/security/circuit_breaker_test.go +++ b/router-tests/security/circuit_breaker_test.go @@ -641,8 +641,9 @@ func TestCircuitBreaker(t *testing.T) { breaker.NumBuckets = 1 breaker.RollingDuration = 5000 * time.Millisecond + breaker.ExecutionTimeout = 30 * time.Second - trafficConfig := getTrafficConfigWithTimeout(breaker, 1*time.Second) + trafficConfig := getTrafficConfigWithTimeout(breaker, 30*time.Second) employeesCalls := atomic.Int64{} From be2b7c90518b8296775d3ba4de7be1afec617e95 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 19:19:31 +0530 Subject: [PATCH 21/24] fix: tests --- router/internal/retrytransport/retry_transport.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index 93746b3e2f..a2e3e69af6 100644 --- a/router/internal/retrytransport/retry_transport.go +++ 
b/router/internal/retrytransport/retry_transport.go @@ -170,6 +170,10 @@ func (rt *RetryHTTPTransport) drainBody(resp *http.Response, logger *zap.Logger) } func isResponseOK(resp *http.Response) bool { + // 101 Switching Protocols is a successful response for WebSocket upgrades + if resp.StatusCode == http.StatusSwitchingProtocols { + return true + } // Ensure we don't wait for no reason when subgraphs don't behave // spec-compliant and returns a different status code than 200. return resp.StatusCode >= 200 && resp.StatusCode < 300 From dfb90dd8acd762106f43101afe078be6c43d0fc6 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 7 Apr 2026 20:12:39 +0530 Subject: [PATCH 22/24] fix: tests --- router-tests/security/circuit_breaker_test.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/router-tests/security/circuit_breaker_test.go b/router-tests/security/circuit_breaker_test.go index efbc59a7a2..c1b6d633bd 100644 --- a/router-tests/security/circuit_breaker_test.go +++ b/router-tests/security/circuit_breaker_test.go @@ -726,6 +726,11 @@ func TestCircuitBreaker(t *testing.T) { require.Equal(t, failedTries, employeesCalls.Load()) + // Ensure all previous subscriptions are fully cleaned up before + // waiting for the circuit to reset, to prevent leftover subscription + // cleanup from interfering with the half-open circuit state. 
+ xEnv.WaitForSubscriptionCount(0, time.Second*5) + // Wait for current bucket to be cleaned up time.Sleep(breaker.RollingDuration*3 + time.Millisecond*1000) @@ -733,7 +738,6 @@ func TestCircuitBreaker(t *testing.T) { // Verify a success case with messages validated from here onwards // ==== - // Sending a complete must stop the subscription conn := xEnv.InitGraphQLWebSocketConnection(nil, nil, nil) err := testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ ID: "1", @@ -744,6 +748,12 @@ func TestCircuitBreaker(t *testing.T) { _, message, err := testenv.WSReadMessage(t, conn) require.NoError(t, err) + + t.Logf("employeesCalls after recovery: %d", employeesCalls.Load()) + t.Logf("received message: %s", string(message)) + t.Logf("circuit breaker status changed count: %d", xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) + t.Logf("circuit breaker open count: %d", xEnv.Observer().FilterMessage("Circuit breaker open, request callback did not execute").Len()) + require.JSONEq(t, timestampMessage, string(message)) err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) From 4f5265294b4988f533944a218282c9c36083b232 Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 21 Apr 2026 19:39:06 +0530 Subject: [PATCH 23/24] fix: changes --- router-tests/security/circuit_breaker_test.go | 14 ++++++-------- router-tests/security/retry_test.go | 5 +++++ router-tests/subscriptions/websocket_test.go | 13 ------------- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/router-tests/security/circuit_breaker_test.go b/router-tests/security/circuit_breaker_test.go index c1b6d633bd..babbd19d70 100644 --- a/router-tests/security/circuit_breaker_test.go +++ b/router-tests/security/circuit_breaker_test.go @@ -641,9 +641,7 @@ func TestCircuitBreaker(t *testing.T) { breaker.NumBuckets = 1 breaker.RollingDuration = 5000 * time.Millisecond - breaker.ExecutionTimeout = 30 * time.Second - - trafficConfig := 
getTrafficConfigWithTimeout(breaker, 30*time.Second) + trafficConfig := getTrafficConfigWithTimeout(breaker, 1*time.Second) employeesCalls := atomic.Int64{} @@ -681,12 +679,17 @@ func TestCircuitBreaker(t *testing.T) { _ = conn.Close() }() + // Read connection_init _, _, err = testenv.WSReadMessage(t, conn) require.NoError(t, err) err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) require.NoError(t, err) + // Read subscribe message before sending data + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) require.NoError(t, err) @@ -749,11 +752,6 @@ func TestCircuitBreaker(t *testing.T) { _, message, err := testenv.WSReadMessage(t, conn) require.NoError(t, err) - t.Logf("employeesCalls after recovery: %d", employeesCalls.Load()) - t.Logf("received message: %s", string(message)) - t.Logf("circuit breaker status changed count: %d", xEnv.Observer().FilterMessage("Circuit breaker status changed").Len()) - t.Logf("circuit breaker open count: %d", xEnv.Observer().FilterMessage("Circuit breaker open, request callback did not execute").Len()) - require.JSONEq(t, timestampMessage, string(message)) err = testenv.WSWriteJSON(t, conn, &testenv.WebSocketMessage{ID: "1", Type: "complete"}) diff --git a/router-tests/security/retry_test.go b/router-tests/security/retry_test.go index a679a643db..2452d0d73d 100644 --- a/router-tests/security/retry_test.go +++ b/router-tests/security/retry_test.go @@ -1027,12 +1027,17 @@ func TestRetryPerSubgraph(t *testing.T) { _ = conn.Close() }() + // Read connection_init _, _, err = testenv.WSReadMessage(t, conn) require.NoError(t, err) err = testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(`{"type":"connection_ack"}`)) require.NoError(t, err) + // Read subscribe message before sending data + _, _, err = testenv.WSReadMessage(t, conn) + require.NoError(t, err) + err = 
testenv.WSWriteMessage(t, conn, websocket.TextMessage, []byte(timestampMessage)) require.NoError(t, err) diff --git a/router-tests/subscriptions/websocket_test.go b/router-tests/subscriptions/websocket_test.go index 029e544b88..981d72fe2c 100644 --- a/router-tests/subscriptions/websocket_test.go +++ b/router-tests/subscriptions/websocket_test.go @@ -741,18 +741,6 @@ func TestWebSockets(t *testing.T) { t.Run("empty allow lists should allow all headers and query args", func(t *testing.T) { t.Parallel() - opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ - All: config.GlobalSubgraphRequestRule{ - BackoffJitterRetry: config.BackoffJitterRetry{ - Enabled: true, - MaxAttempts: 5, - MaxDuration: 10 * time.Second, - Interval: 200 * time.Millisecond, - Expression: "true", - }, - }, - }) - headerRules := config.HeaderRules{ All: &config.GlobalHeaderRule{ Request: []*config.RequestHeaderRule{ @@ -781,7 +769,6 @@ func TestWebSockets(t *testing.T) { }, RouterOptions: []core.Option{ core.WithHeaderRules(headerRules), - core.WithSubgraphRetryOptions(opts), }, Subgraphs: testenv.SubgraphsConfig{ Employees: testenv.SubgraphConfig{ From 65b1db84e08d1d72257460595b03e87bc743a5ba Mon Sep 17 00:00:00 2001 From: Milinda Dias Date: Tue, 21 Apr 2026 20:42:18 +0530 Subject: [PATCH 24/24] fix: retry updates --- router-tests/security/retry_test.go | 3 +- router/core/graph_server.go | 13 +--- router/core/retry_builder.go | 4 +- router/core/retry_builder_test.go | 45 +++++-------- router/internal/retrytransport/manager.go | 44 +++++++------ .../retrytransport/retry_transport.go | 4 +- .../retrytransport/retry_transport_test.go | 63 ++++++++++++++++++- 7 files changed, 107 insertions(+), 69 deletions(-) diff --git a/router-tests/security/retry_test.go b/router-tests/security/retry_test.go index 2452d0d73d..7939240891 100644 --- a/router-tests/security/retry_test.go +++ b/router-tests/security/retry_test.go @@ -928,12 +928,11 @@ func TestRetryPerSubgraph(t *testing.T) { }) }) - 
t.Run("verify retries are applied when only all and no subgraph specific overrides are present", func(t *testing.T) { + t.Run("verify retries are applied on feature flags when only all and no subgraph specific overrides are present", func(t *testing.T) { t.Parallel() calls := atomic.Int32{} - // Configure per-subgraph retry: employees gets 3 retries, test1 gets 1 retry generalMax := 5 opts := core.NewSubgraphRetryOptions(config.TrafficShapingRules{ All: config.GlobalSubgraphRequestRule{ diff --git a/router/core/graph_server.go b/router/core/graph_server.go index d7634bdc34..8b313c17aa 100644 --- a/router/core/graph_server.go +++ b/router/core/graph_server.go @@ -298,20 +298,13 @@ func newGraphServer(ctx context.Context, r *Router, routerConfig *nodev1.RouterC if s.retryOptions.IsEnabled() { retryExprManager := expr.NewRetryExpressionManager() - // Build retry options and handle any expression compilation errors - shouldRetryFunc, err := BuildRetryFunction(retryExprManager) - if err != nil { - return nil, fmt.Errorf("failed to build retry function: %w", err) - } - - retryManager := retrytransport.NewManager(retryExprManager, shouldRetryFunc, s.retryOptions.OnRetryFunc) + retryManager := retrytransport.NewManager(retryExprManager, BuildRetryFunction(retryExprManager), s.retryOptions.OnRetryFunc, s.logger) - err = retryManager.Initialize( + if err := retryManager.Initialize( s.retryOptions.All, s.retryOptions.SubgraphMap, routerConfig, - ) - if err != nil { + ); err != nil { return nil, err } diff --git a/router/core/retry_builder.go b/router/core/retry_builder.go index e563b58d3a..00ce7443ca 100644 --- a/router/core/retry_builder.go +++ b/router/core/retry_builder.go @@ -10,7 +10,7 @@ import ( ) // BuildRetryFunction creates a ShouldRetry function based on the provided expression -func BuildRetryFunction(manager *expr.RetryExpressionManager) (retrytransport.ShouldRetryFunc, error) { +func BuildRetryFunction(manager *expr.RetryExpressionManager) 
retrytransport.ShouldRetryFunc { return func(err error, req *http.Request, resp *http.Response, expression string) bool { reqContext := getRequestContext(req.Context()) if reqContext == nil { @@ -42,7 +42,7 @@ func BuildRetryFunction(manager *expr.RetryExpressionManager) (retrytransport.Sh } return shouldRetry - }, nil + } } // isDefaultRetryableError checks for errors that should always be retryable diff --git a/router/core/retry_builder_test.go b/router/core/retry_builder_test.go index 7bfeda9fcd..ecf5b4662f 100644 --- a/router/core/retry_builder_test.go +++ b/router/core/retry_builder_test.go @@ -51,8 +51,7 @@ func createRequestWithContext(opType string) (*http.Request, *requestContext) { func TestBuildRetryFunction(t *testing.T) { t.Run("build function when retry is disabled", func(t *testing.T) { manager := expr.NewRetryExpressionManager() - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) }) @@ -62,8 +61,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression("") assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -90,8 +88,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -116,8 +113,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -149,8 +145,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression("") assert.NoError(t, err) - fn, err := 
BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -179,8 +174,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -195,8 +189,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -217,8 +210,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with proper query context @@ -245,8 +237,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with mutation context @@ -275,8 +266,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with query context @@ -300,8 +290,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with subscription context @@ -323,8 +312,7 @@ func TestBuildRetryFunction(t *testing.T) { err := 
manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create request with proper context @@ -344,8 +332,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create request with proper query context @@ -366,8 +353,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Create a request with mutation context @@ -394,8 +380,7 @@ func TestBuildRetryFunction(t *testing.T) { err := manager.AddExpression(expression) assert.NoError(t, err) - fn, err := BuildRetryFunction(manager) - assert.NoError(t, err) + fn := BuildRetryFunction(manager) assert.NotNil(t, fn) // Test query operation - should retry on all conditions diff --git a/router/internal/retrytransport/manager.go b/router/internal/retrytransport/manager.go index 576358937d..0271a39706 100644 --- a/router/internal/retrytransport/manager.go +++ b/router/internal/retrytransport/manager.go @@ -3,7 +3,6 @@ package retrytransport import ( "fmt" "net/http" - "sync" "time" nodev1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" @@ -32,18 +31,22 @@ type RetryOptions struct { type Manager struct { retries map[string]*RetryOptions - lock sync.RWMutex exprManager *expr.RetryExpressionManager retryFunc ShouldRetryFunc - OnRetry OnRetryFunc + onRetry OnRetryFunc + logger *zap.Logger } -func NewManager(exprManager *expr.RetryExpressionManager, retryFunc ShouldRetryFunc, onRetryFunc OnRetryFunc) *Manager { +func NewManager(exprManager *expr.RetryExpressionManager, retryFunc ShouldRetryFunc, onRetryFunc OnRetryFunc, logger 
*zap.Logger) *Manager { + if logger == nil { + logger = zap.NewNop() + } return &Manager{ retries: make(map[string]*RetryOptions), exprManager: exprManager, retryFunc: retryFunc, - OnRetry: onRetryFunc, + onRetry: onRetryFunc, + logger: logger, } } @@ -61,6 +64,17 @@ func (m *Manager) Initialize(baseRetryOptions RetryOptions, subgraphRetryOptions } } + // Warn on retry configs pointing at subgraphs that don't exist in the + // router config — likely a typo that would otherwise silently disable + // the override. + for sgName := range subgraphRetryOptions { + if !subgraphNameSet[sgName] { + m.logger.Warn("Retry config references unknown subgraph; override will be ignored", + zap.String("subgraph_name", sgName), + ) + } + } + defaultSgNames := make([]string, 0, len(subgraphNameSet)) customSgNames := make([]string, 0, len(subgraphNameSet)) @@ -80,6 +94,8 @@ return fmt.Errorf("unsupported retry algorithm: %s", baseRetryOptions.Algorithm) } + // There is a chance that this is not evaluated if len(defaultSgNames) == 0, and it will + // only error out when it's > 0. err := m.exprManager.AddExpression(baseRetryOptions.Expression) if err != nil { return fmt.Errorf("failed to add base retry expression: %w", err) } @@ -89,18 +105,17 @@ opts := baseRetryOptions m.retries[sgName] = &opts } - } // Process custom retry options for _, sgName := range customSgNames { entry, ok := subgraphRetryOptions[sgName] if !ok { - return fmt.Errorf("failed to add base retry expression: %s", sgName) + return fmt.Errorf("failed to get subgraphRetryOptions: %s", sgName) } if entry.Algorithm != BackoffJitter { - return fmt.Errorf("unsupported retry algorithm for subgraph %s: %s", sgName, baseRetryOptions.Algorithm) + return fmt.Errorf("unsupported retry algorithm for subgraph %s: %s", sgName, entry.Algorithm) } // Validate expression 
before assigning options @@ -121,24 +136,13 @@ func (m *Manager) GetSubgraphOptions(name string) *RetryOptions { if m == nil { return nil } - - m.lock.RLock() - defer m.lock.RUnlock() - - if retryOptions, ok := m.retries[name]; ok { - return retryOptions - } - return nil + return m.retries[name] } func (m *Manager) IsEnabled() bool { if m == nil { return false } - - m.lock.RLock() - defer m.lock.RUnlock() - return len(m.retries) > 0 } diff --git a/router/internal/retrytransport/retry_transport.go b/router/internal/retrytransport/retry_transport.go index a2e3e69af6..9d9e452497 100644 --- a/router/internal/retrytransport/retry_transport.go +++ b/router/internal/retrytransport/retry_transport.go @@ -126,8 +126,8 @@ func (rt *RetryHTTPTransport) RoundTrip(req *http.Request) (*http.Response, erro } // A hook used for testing - if rt.retryManager.OnRetry != nil { - rt.retryManager.OnRetry(retries, req, resp, sleepDuration, err) + if rt.retryManager.onRetry != nil { + rt.retryManager.onRetry(retries, req, resp, sleepDuration, err) } // Wait for the specified duration diff --git a/router/internal/retrytransport/retry_transport_test.go b/router/internal/retrytransport/retry_transport_test.go index 81877be9af..41328c37ce 100644 --- a/router/internal/retrytransport/retry_transport_test.go +++ b/router/internal/retrytransport/retry_transport_test.go @@ -13,10 +13,13 @@ import ( "time" "go.uber.org/zap/zapcore" + "go.uber.org/zap/zaptest/observer" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.uber.org/zap" + nodev1 "github.com/wundergraph/cosmo/router/gen/proto/wg/cosmo/node/v1" "github.com/wundergraph/cosmo/router/internal/expr" ) @@ -65,7 +68,7 @@ func (dt *MockTransport) RoundTrip(req *http.Request) (*http.Response, error) { func newTestManager(shouldRetry func(error, *http.Request, *http.Response) bool, onRetry OnRetryFunc, opts RetryOptions, subgraphName string) *Manager { mgr := NewManager(expr.NewRetryExpressionManager(), func(err 
error, req *http.Request, resp *http.Response, _ string) bool { return shouldRetry(err, req, resp) - }, onRetry) + }, onRetry, zap.NewNop()) // attach options for the subgraph mgr.retries[subgraphName] = &opts return mgr @@ -227,7 +230,7 @@ func TestShortCircuitOnSuccess(t *testing.T) { subgraphName := "sg" mgr := newTestManager(simpleShouldRetry, func(_ int, _ *http.Request, _ *http.Response, _ time.Duration, _ error) { - t.Error("OnRetry should not be called when first request succeeds") + t.Error("onRetry should not be called when first request succeeds") }, RetryOptions{MaxRetryCount: 5, Interval: 1 * time.Millisecond, MaxDuration: 10 * time.Millisecond}, subgraphName) tr := NewRetryHTTPTransport(&MockTransport{ @@ -445,7 +448,7 @@ func TestOnRetryCallbackInvoked(t *testing.T) { assert.NoError(t, err) assert.Equal(t, http.StatusOK, resp.StatusCode) - // Verify OnRetry was called the right number of times + // Verify onRetry was called the right number of times assert.Equal(t, maxRetries, retries) assert.Len(t, retryCallbacks, maxRetries) @@ -1015,3 +1018,57 @@ func TestShouldUseRetryAfter(t *testing.T) { }) } } + +func TestManager_Initialize_WarnsOnUnknownSubgraph(t *testing.T) { + routerConfig := &nodev1.RouterConfig{ + Subgraphs: []*nodev1.Subgraph{ + {Id: "known-id", Name: "known", RoutingUrl: "http://localhost:8001/graphql"}, + }, + } + + t.Run("warns for each retry config whose subgraph is not in the router config", func(t *testing.T) { + core, logs := observer.New(zapcore.WarnLevel) + mgr := NewManager(expr.NewRetryExpressionManager(), nil, nil, zap.New(core)) + + err := mgr.Initialize( + RetryOptions{}, + map[string]RetryOptions{ + "known": {Enabled: true, Algorithm: BackoffJitter, Expression: "true"}, + "typo_name": {Enabled: true, Algorithm: BackoffJitter, Expression: "true"}, + "other_typo": {Enabled: false}, + }, + routerConfig, + ) + require.NoError(t, err) + + warnings := logs.FilterMessage("Retry config references unknown subgraph; override will 
be ignored").All() + require.Len(t, warnings, 2) + + got := make(map[string]bool, len(warnings)) + for _, entry := range warnings { + got[entry.ContextMap()["subgraph_name"].(string)] = true + } + require.True(t, got["typo_name"]) + require.True(t, got["other_typo"]) + + // Unknown-named overrides must not leak into the retries map + require.Nil(t, mgr.GetSubgraphOptions("typo_name")) + require.Nil(t, mgr.GetSubgraphOptions("other_typo")) + require.NotNil(t, mgr.GetSubgraphOptions("known")) + }) + + t.Run("does not warn when every retry config matches a known subgraph", func(t *testing.T) { + core, logs := observer.New(zapcore.WarnLevel) + mgr := NewManager(expr.NewRetryExpressionManager(), nil, nil, zap.New(core)) + + err := mgr.Initialize( + RetryOptions{}, + map[string]RetryOptions{ + "known": {Enabled: true, Algorithm: BackoffJitter, Expression: "true"}, + }, + routerConfig, + ) + require.NoError(t, err) + require.Zero(t, logs.FilterMessage("Retry config references unknown subgraph; override will be ignored").Len()) + }) +}