Azure · tvaron3 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
@@ -12,7 +12,7 @@
 
 * Fixed missing OTel tracing spans for internal queries executed by `ReadManyItems`. Each per-partition query page now creates a `query_items` span, matching the tracing behavior of `NewQueryItemsPager`. See [PR 26813](https://github.com/Azure/azure-sdk-for-go/pull/26813).
 * 403/`WriteForbidden` retries refresh the global endpoint manager fire-and-forget (CAS-gated) instead of blocking on a synchronous `gem.Update`. See [PR 26889](https://github.com/Azure/azure-sdk-for-go/pull/26889).
-* Connection-error retry policy now attempts up to 3 retries against the current region before failing over, and performs at most one cross-region failover per call. Cross-region failover for writes only occurs when the error proves the request never reached the service (DNS, dial, TLS handshake, `ECONNREFUSED`, etc.); writes on ambiguous transport failures (e.g. `ECONNRESET`, `EOF`, transport-level timeouts) no longer fail over to another region, avoiding potential duplicate writes. Reads still fail over for any transport error. Caller-set context deadlines or cancellations short-circuit the policy without consuming the caller's budget with retries. See [PR 26858](https://github.com/Azure/azure-sdk-for-go/pull/26858).
+* Connection-error retry policy now attempts up to 3 retries against the current region before failing over, and performs at most one cross-region failover per call. Cross-region failover for writes only occurs when the error proves the request never reached the service (DNS, dial, TLS handshake, `ECONNREFUSED`, etc.); writes on ambiguous transport failures (e.g. `ECONNRESET`, `EOF`, transport-level timeouts) no longer fail over to another region, avoiding potential duplicate writes. Reads still fail over for any transport error. Caller-set context deadlines or cancellations short-circuit the policy without consuming the caller's budget with retries. See [PR 26858](https://github.com/Azure/azure-sdk-for-go/pull/26858) and [PR 26915](https://github.com/Azure/azure-sdk-for-go/pull/26915).
 * HTTP `408 Request Timeout` responses are now handled by the Cosmos client retry policy: reads are retried exactly once against another region, and writes are returned to the caller immediately to avoid potential duplicates. See [PR 26858](https://github.com/Azure/azure-sdk-for-go/pull/26858).
 * Fixed excessive `GetDatabaseAccount` HTTP calls when using preferred regions, and stopped data-plane retries from trailing into the customer-supplied (default) endpoint once account topology is populated. See [PR 26815](https://github.com/Azure/azure-sdk-for-go/pull/26815).
 * Partition key range cache now serves concurrent callers from a single in-flight refresh per container, and the cached routing map remains readable while a refresh is in progress. The refresh runs on a detached background `context.Background()` so a caller's cancellation no longer aborts the shared fetch for other waiters; each caller continues to honor its own context deadline. See [PR 26855](https://github.com/Azure/azure-sdk-for-go/pull/26855).

@@ -289,51 +289,35 @@ func (p *clientRetryPolicy) shouldRetryStatus(status int, subStatus string) (sho
 	return false
 }
 
-// attemptRetryOnNetworkError decides how to respond to a transport-level
-// failure. While the policy is enabled (enableCrossRegionRetries), the
-// first maxSameRegionConnectionRetries attempts retry against the same
-// region (the currently-resolved endpoint) without touching the location
-// cache. Once that budget is exhausted, exactly one cross-region failover
-// is attempted, subject to write-safety rules:
-//   - reads always fail over;
-//   - writes only fail over when the error is classified as
-//     connectionErrorNotSent (i.e. we are sure the request never reached
-//     the service). Writes on ambiguous errors stop retrying to avoid
-//     duplicate side-effects, and mark the endpoint unavailable so
-//     concurrent requests learn about the regional outage.
+// attemptRetryOnNetworkError handles transport-level failures. With
+// cross-region retries enabled, it allows up to maxSameRegionConnectionRetries
+// against the current region (reads always, writes only when
+// connectionErrorNotSent so non-idempotent mutations are never replayed),
+// then performs at most one cross-region failover.
 //
-// Ambiguous-error writes also skip the same-region budget: replaying a
-// non-idempotent mutation (e.g. PatchItem(Increment), TransactionalBatch)
-// up to 3 times against the same region could silently produce up to 4
-// applications of the operation. Reads can always be retried.
+// MarkEndpointUnavailable* is only invoked for connectionErrorNotSent;
+// a single mid-exchange failure is too weak a signal to declare the
+// whole region unavailable for concurrent and future traffic. As a
+// result, the ambiguous-read failover cannot rely on demote-to-tail —
+// it bumps retryContext.retryCount so the Do loop resolves a different
+// locationIndex on the next iteration (mirroring the 503/408 paths).
 //
-// After the single cross-region failover, any further connection error
-// stops retrying — the policy does not chain failovers across regions.
-//
-// When enableCrossRegionRetries is false the policy performs no retries
-// at all; the caller has explicitly opted into a "fail fast" mode.
-//
-// transportErr is the underlying transport error from req.Next(); it is
-// preserved alongside the caller's context error if the backoff is
-// interrupted by ctx cancellation, so callers can errors.Is the result
-// against both context.DeadlineExceeded and the transport error class.
+// transportErr is preserved alongside the caller's context error when
+// the backoff is interrupted by ctx cancellation, so callers can
+// errors.Is against both context.DeadlineExceeded and the transport
+// error class.
 func (p *clientRetryPolicy) attemptRetryOnNetworkError(req *policy.Request, kind connectionErrorKind, isWriteOperation bool, transportErr error, retryContext *retryContext) (bool, error) {
 	if retryContext.retryCount > maxRetryCount {
 		return false, nil
 	}
-	// If the caller disabled cross-region retries we treat that as a
-	// blanket opt-out from any retries performed by this policy,
-	// preserving the pre-existing "fail fast" semantics.
+	// Caller opted out of any retries.
 	if !p.gem.locationCache.enableCrossRegionRetries {
 		return false, nil
 	}
 
-	// While still on the original region, allow the same-region budget,
-	// but only for operations where retry is safe: reads always, writes
-	// only when we can prove the request never reached the service.
-	// Ambiguous-error writes skip this branch entirely so a
-	// non-idempotent mutation that may have been applied server-side is
-	// not silently replayed.
+	// Same-region budget: reads always, writes only when we can prove
+	// the request never reached the service (avoids replaying
+	// non-idempotent mutations).
 	if !retryContext.crossRegionFailoverDone &&
 		retryContext.sameRegionRetryCount < maxSameRegionConnectionRetries &&
 		(!isWriteOperation || kind == connectionErrorNotSent) {
@@ -344,59 +328,52 @@ func (p *clientRetryPolicy) attemptRetryOnNetworkError(req *policy.Request, kind
 		return true, nil
 	}
 
-	// We've either exhausted the same-region budget or already failed
-	// over once. We only ever perform a single cross-region failover
-	// from this policy; further connection failures bubble up to the
-	// caller.
+	// At most one cross-region failover per request.
 	if retryContext.crossRegionFailoverDone {
 		return false, nil
 	}
 
-	// Decide whether a cross-region failover is even possible for this
-	// operation before mutating any shared state. Writes on a
-	// single-master account cannot meaningfully fail over (there is only
-	// one write region), so don't bother marking the endpoint
-	// unavailable — that would just degrade the cache for everyone
-	// without producing a different endpoint.
 	canCrossRegionWrite := !isWriteOperation || p.gem.CanUseMultipleWriteLocations()
 	if isWriteOperation && (kind != connectionErrorNotSent || !canCrossRegionWrite) {
-		// Ambiguous failure or single-master write: cannot safely retry
-		// on another region. Mark unavailable for reads (concurrent
-		// readers learn), but not for writes on single-master (nowhere
-		// else to send writes). No forced gem.Update: the invalidate()
-		// inside MarkEndpointUnavailable* will arm the next non-force
-		// Update on its own, and a connection error is not a topology
-		// change.
-		if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil {
-			return false, err
-		}
-		if canCrossRegionWrite {
-			if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil {
+		// Ambiguous write OR single-master write: cannot safely retry.
+		// Only mark when NotSent (single-master write); ambiguous is
+		// too weak a signal. No forced gem.Update: invalidate() inside
+		// MarkEndpointUnavailable* arms the next non-force Update.
+		if kind == connectionErrorNotSent {
+			if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil {
 				return false, err
 			}
+			if canCrossRegionWrite {
+				if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil {
+					return false, err
+				}
+			}
 		}
 		return false, nil
 	}
 
-	if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil {
-		return false, err
-	}
-	if isWriteOperation {
-		if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil {
+	// Cross-region failover: reads (any kind) or NotSent multi-master
+	// writes (ambiguous writes are gated above).
+	if kind == connectionErrorNotSent {
+		// Mark + demote-to-tail; resolveFromHead pins the next resolve
+		// to index 0, which now points at the failover region.
+		if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil {
 			return false, err
 		}
+		if isWriteOperation {
+			if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil {
+				return false, err
+			}
+		}
+		retryContext.resolveFromHead = true
+	} else {
+		// Ambiguous read: don't mark. Route to the next region by
+		// bumping retryCount (Do loop uses it as locationIndex when
+		// resolveFromHead is not set). Mirrors the 503/408 paths.
+		retryContext.retryCount += 1
 	}
-	// No forced gem.Update: gating the failover on it would surface a
-	// metadata-endpoint timeout (the global FQDN often resolves to the
-	// same regional FE pool we just marked unavailable) and skip the
-	// cross-region retry. invalidate() inside MarkEndpointUnavailable*
-	// arms the next non-force Update for real topology changes.
 
 	retryContext.sameRegionRetryCount = 0
-	// Demote-to-tail leaves the bad endpoint at index 1+; force the
-	// next resolve to use index 0 instead of the (possibly bumped)
-	// retryCount, otherwise we'd route right back to the demoted slot.
-	retryContext.resolveFromHead = true
 	retryContext.crossRegionFailoverDone = true
 	if sleepErr := sleepWithContext(req.Raw().Context(), defaultBackoff*time.Second); sleepErr != nil {
 		return false, fmt.Errorf("%w: underlying transport error: %v", sleepErr, transportErr)

@@ -813,23 +813,80 @@ func TestAmbiguousConnectionErrorWriteDoesNotFailOver(t *testing.T) {
 }
 
 func TestAmbiguousConnectionErrorReadFailsOver(t *testing.T) {
-	client, srv, verifier, cleanup := setupRetryPolicyTestClient(t)
-	defer cleanup()
+	// Verify the ambiguous-read failover actually routes the in-flight
+	// retry to a different region. The new mechanism (bump retryCount
+	// instead of mark + demote) must move us from badSrv to goodSrv
+	// without touching the location cache.
+	badSrv, badClose := mock.NewTLSServer()
+	defer badClose()
+	goodSrv, goodClose := mock.NewTLSServer()
+	defer goodClose()
+
+	badURL, err := url.Parse(badSrv.URL())
+	require.NoError(t, err)
+	goodURL, err := url.Parse(goodSrv.URL())
+	require.NoError(t, err)
+
+	gemServer, gemClose := mock.NewTLSServer()
+	defer gemClose()
+	gemServer.SetError(&net.DNSError{})
+	internalPipeline := azruntime.NewPipeline("azcosmosgemtest", "v1.0.0", azruntime.PipelineOptions{}, &policy.ClientOptions{Transport: gemServer})
+
+	lc := newLocationCache([]string{"East US", "Central US"}, *badURL, true /*enableCrossRegionRetries*/)
+	require.NoError(t, lc.update(
+		[]accountRegion{{Name: "East US", Endpoint: badSrv.URL()}},
+		[]accountRegion{
+			{Name: "East US", Endpoint: badSrv.URL()},
+			{Name: "Central US", Endpoint: goodSrv.URL()},
+		},
+		[]string{"East US", "Central US"},
+		nil,
+	))
+
+	gem := &globalEndpointManager{
+		clientEndpoint:      gemServer.URL(),
+		pipeline:            internalPipeline,
+		preferredLocations:  []string{"East US", "Central US"},
+		locationCache:       lc,
+		refreshTimeInterval: defaultExpirationTime,
+		lastUpdateTime:      time.Time{},
+	}
+
+	routingTransport := routingMockTransport{
+		byHost: map[string]*mock.Server{
+			badURL.Host:  badSrv,
+			goodURL.Host: goodSrv,
+		},
+	}
+
+	retryPolicy := &clientRetryPolicy{gem: gem}
+	verifier := &clientRetryPolicyVerifier{}
+	internalClient, _ := azcore.NewClient("azcosmostest", "v1.0.0", azruntime.PipelineOptions{PerRetry: []policy.Policy{verifier, retryPolicy}}, &policy.ClientOptions{Transport: &routingTransport})
+	client := &Client{endpoint: badSrv.URL(), endpointUrl: badURL, internal: internalClient, gem: gem}
 
 	ambErr := &fakeAmbiguousNetError{msg: "connection reset by peer"}
+	// 1 initial + 3 same-region retries on the bad region.
 	for i := 0; i < 4; i++ {
-		srv.AppendError(ambErr)
+		badSrv.AppendError(ambErr)
 	}
-	srv.AppendResponse(mock.WithStatusCode(200))
+	// Cross-region failover should hit the good region.
+	goodSrv.AppendResponse(mock.WithStatusCode(200))
 
 	db, _ := client.NewDatabase("database_id")
 	container, _ := db.NewContainer("container_id")
-	_, err := container.ReadItem(context.TODO(), NewPartitionKeyString("1"), "doc1", nil)
+	_, err = container.ReadItem(context.TODO(), NewPartitionKeyString("1"), "doc1", nil)
 
-	assert.NoError(t, err)
+	require.NoError(t, err, "ambiguous-read failover should reach the good region")
 	rc := verifier.requests[0].retryContext
+	assert.True(t, rc.crossRegionFailoverDone)
+	// Ambiguous failover bumps retryCount instead of marking + demoting,
+	// so ResolveServiceEndpoint(1) returns the second preferred region.
+	assert.Equal(t, 1, rc.retryCount)
 	assert.Equal(t, 0, rc.sameRegionRetryCount)
-	assert.Equal(t, 0, rc.retryCount) // post-fix: retryCount not incremented on connection-error failover; demote-in-cache handles routing
+	assert.Equal(t, 4, badSrv.Requests())
+	assert.Equal(t, 1, goodSrv.Requests())
+	assert.Empty(t, lc.locationUnavailabilityInfoMap,
+		"ambiguous read failover must not mark any endpoint unavailable")
 }
 
 func TestCallerDeadlineExceededDoesNotRetry(t *testing.T) {
@@ -1003,10 +1060,10 @@ func TestSingleMasterWriteDoesNotFailoverOnConnectionError(t *testing.T) {
 	}
 }
 
-func TestAmbiguousWriteMarksEndpointUnavailableForRead(t *testing.T) {
-	// Multi-master write that gives up on an ambiguous transport error
-	// should still mark the endpoint unavailable for read so concurrent
-	// requests learn about the regional outage.
+func TestAmbiguousWriteDoesNotMarkEndpointUnavailable(t *testing.T) {
+	// Ambiguous transport errors are too weak a signal to mark the
+	// region unavailable; the request may even have been processed
+	// server-side.
 	client, srv, verifier, cleanup := setupRetryPolicyTestClient(t)
 	defer cleanup()
 
@@ -1022,20 +1079,11 @@ func TestAmbiguousWriteMarksEndpointUnavailableForRead(t *testing.T) {
 	require.NoError(t, err)
 	_, err = container.CreateItem(context.TODO(), NewPartitionKeyString("1"), marshalled, nil)
 
-	// At least one endpoint must have been marked unavailable for write
-	// (single-master would NOT do this; we use multi-master here).
 	require.Error(t, err)
 	rc := verifier.requests[0].retryContext
 	assert.False(t, rc.crossRegionFailoverDone)
-	// Marked unavailable for read for at least one endpoint.
-	var markedForRead bool
-	for _, info := range client.gem.locationCache.locationUnavailabilityInfoMap {
-		if info.unavailableOps == read || info.unavailableOps == all {
-			markedForRead = true
-			break
-		}
-	}
-	assert.True(t, markedForRead, "expected at least one endpoint marked unavailable for read")
+	assert.Empty(t, client.gem.locationCache.locationUnavailabilityInfoMap,
+		"ambiguous write must not mark any endpoint unavailable")
 }
 
 func TestConnectionErrorWithCrossRegionRetriesDisabledFailsFast(t *testing.T) {