diff --git a/sdk/data/azcosmos/CHANGELOG.md b/sdk/data/azcosmos/CHANGELOG.md index 905d7fac617e..67f3b633b9c0 100644 --- a/sdk/data/azcosmos/CHANGELOG.md +++ b/sdk/data/azcosmos/CHANGELOG.md @@ -12,7 +12,7 @@ * Fixed missing OTel tracing spans for internal queries executed by `ReadManyItems`. Each per-partition query page now creates a `query_items` span, matching the tracing behavior of `NewQueryItemsPager`. See [PR 26813](https://github.com/Azure/azure-sdk-for-go/pull/26813). * 403/`WriteForbidden` retries refresh the global endpoint manager fire-and-forget (CAS-gated) instead of blocking on a synchronous `gem.Update`. See [PR 26889](https://github.com/Azure/azure-sdk-for-go/pull/26889). -* Connection-error retry policy now attempts up to 3 retries against the current region before failing over, and performs at most one cross-region failover per call. Cross-region failover for writes only occurs when the error proves the request never reached the service (DNS, dial, TLS handshake, `ECONNREFUSED`, etc.); writes on ambiguous transport failures (e.g. `ECONNRESET`, `EOF`, transport-level timeouts) no longer fail over to another region, avoiding potential duplicate writes. Reads still fail over for any transport error. Caller-set context deadlines or cancellations short-circuit the policy without consuming the caller's budget with retries. See [PR 26858](https://github.com/Azure/azure-sdk-for-go/pull/26858). +* Connection-error retry policy now attempts up to 3 retries against the current region before failing over, and performs at most one cross-region failover per call. Cross-region failover for writes only occurs when the error proves the request never reached the service (DNS, dial, TLS handshake, `ECONNREFUSED`, etc.); writes on ambiguous transport failures (e.g. `ECONNRESET`, `EOF`, transport-level timeouts) no longer fail over to another region, avoiding potential duplicate writes. Reads still fail over for any transport error. Caller-set context deadlines or cancellations short-circuit the policy without consuming the caller's budget with retries. See [PR 26858](https://github.com/Azure/azure-sdk-for-go/pull/26858) and [PR 26915](https://github.com/Azure/azure-sdk-for-go/pull/26915). * HTTP `408 Request Timeout` responses are now handled by the Cosmos client retry policy: reads are retried exactly once against another region, and writes are returned to the caller immediately to avoid potential duplicates. See [PR 26858](https://github.com/Azure/azure-sdk-for-go/pull/26858). * Fixed excessive `GetDatabaseAccount` HTTP calls when using preferred regions, and stopped data-plane retries from trailing into the customer-supplied (default) endpoint once account topology is populated. See [PR 26815](https://github.com/Azure/azure-sdk-for-go/pull/26815). * Partition key range cache now serves concurrent callers from a single in-flight refresh per container, and the cached routing map remains readable while a refresh is in progress. The refresh runs on a detached background `context.Background()` so a caller's cancellation no longer aborts the shared fetch for other waiters; each caller continues to honor its own context deadline. See [PR 26855](https://github.com/Azure/azure-sdk-for-go/pull/26855). diff --git a/sdk/data/azcosmos/cosmos_client_retry_policy.go b/sdk/data/azcosmos/cosmos_client_retry_policy.go index 1f78a708eda5..c3c154dc4aae 100644 --- a/sdk/data/azcosmos/cosmos_client_retry_policy.go +++ b/sdk/data/azcosmos/cosmos_client_retry_policy.go @@ -289,51 +289,35 @@ func (p *clientRetryPolicy) shouldRetryStatus(status int, subStatus string) (sho return false } -// attemptRetryOnNetworkError decides how to respond to a transport-level -// failure. While the policy is enabled (enableCrossRegionRetries), the -// first maxSameRegionConnectionRetries attempts retry against the same -// region (the currently-resolved endpoint) without touching the location -// cache. Once that budget is exhausted, exactly one cross-region failover -// is attempted, subject to write-safety rules: -// - reads always fail over; -// - writes only fail over when the error is classified as -// connectionErrorNotSent (i.e. we are sure the request never reached -// the service). Writes on ambiguous errors stop retrying to avoid -// duplicate side-effects, and mark the endpoint unavailable so -// concurrent requests learn about the regional outage. +// attemptRetryOnNetworkError handles transport-level failures. With +// cross-region retries enabled, it allows up to maxSameRegionConnectionRetries +// against the current region (reads always, writes only when +// connectionErrorNotSent so non-idempotent mutations are never replayed), +// then performs at most one cross-region failover. // -// Ambiguous-error writes also skip the same-region budget: replaying a -// non-idempotent mutation (e.g. PatchItem(Increment), TransactionalBatch) -// up to 3 times against the same region could silently produce up to 4 -// applications of the operation. Reads can always be retried. +// MarkEndpointUnavailable* is only invoked for connectionErrorNotSent; +// a single mid-exchange failure is too weak a signal to declare the +// whole region unavailable for concurrent and future traffic. As a +// result, the ambiguous-read failover cannot rely on demote-to-tail — +// it bumps retryContext.retryCount so the Do loop resolves a different +// locationIndex on the next iteration (mirroring the 503/408 paths). // -// After the single cross-region failover, any further connection error -// stops retrying — the policy does not chain failovers across regions. -// -// When enableCrossRegionRetries is false the policy performs no retries -// at all; the caller has explicitly opted into a "fail fast" mode. -// -// transportErr is the underlying transport error from req.Next(); it is -// preserved alongside the caller's context error if the backoff is -// interrupted by ctx cancellation, so callers can errors.Is the result -// against both context.DeadlineExceeded and the transport error class. +// transportErr is preserved alongside the caller's context error when +// the backoff is interrupted by ctx cancellation, so callers can +// errors.Is against both context.DeadlineExceeded and the transport +// error class. func (p *clientRetryPolicy) attemptRetryOnNetworkError(req *policy.Request, kind connectionErrorKind, isWriteOperation bool, transportErr error, retryContext *retryContext) (bool, error) { if retryContext.retryCount > maxRetryCount { return false, nil } - // If the caller disabled cross-region retries we treat that as a - // blanket opt-out from any retries performed by this policy, - // preserving the pre-existing "fail fast" semantics. + // Caller opted out of any retries. if !p.gem.locationCache.enableCrossRegionRetries { return false, nil } - // While still on the original region, allow the same-region budget, - // but only for operations where retry is safe: reads always, writes - // only when we can prove the request never reached the service. - // Ambiguous-error writes skip this branch entirely so a - // non-idempotent mutation that may have been applied server-side is - // not silently replayed. + // Same-region budget: reads always, writes only when we can prove + // the request never reached the service (avoids replaying + // non-idempotent mutations). if !retryContext.crossRegionFailoverDone && retryContext.sameRegionRetryCount < maxSameRegionConnectionRetries && (!isWriteOperation || kind == connectionErrorNotSent) { @@ -344,59 +328,52 @@ func (p *clientRetryPolicy) attemptRetryOnNetworkError(req *policy.Request, kind return true, nil } - // We've either exhausted the same-region budget or already failed - // over once. We only ever perform a single cross-region failover - // from this policy; further connection failures bubble up to the - // caller. + // At most one cross-region failover per request. if retryContext.crossRegionFailoverDone { return false, nil } - // Decide whether a cross-region failover is even possible for this - // operation before mutating any shared state. Writes on a - // single-master account cannot meaningfully fail over (there is only - // one write region), so don't bother marking the endpoint - // unavailable — that would just degrade the cache for everyone - // without producing a different endpoint. canCrossRegionWrite := !isWriteOperation || p.gem.CanUseMultipleWriteLocations() if isWriteOperation && (kind != connectionErrorNotSent || !canCrossRegionWrite) { - // Ambiguous failure or single-master write: cannot safely retry - // on another region. Mark unavailable for reads (concurrent - // readers learn), but not for writes on single-master (nowhere - // else to send writes). No forced gem.Update: the invalidate() - // inside MarkEndpointUnavailable* will arm the next non-force - // Update on its own, and a connection error is not a topology - // change. - if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil { - return false, err - } - if canCrossRegionWrite { - if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil { + // Ambiguous write OR single-master write: cannot safely retry. + // Only mark when NotSent (single-master write); ambiguous is + // too weak a signal. No forced gem.Update: invalidate() inside + // MarkEndpointUnavailable* arms the next non-force Update. + if kind == connectionErrorNotSent { + if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil { return false, err } + if canCrossRegionWrite { + if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil { + return false, err + } + } } return false, nil } - if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil { - return false, err - } - if isWriteOperation { - if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil { + // Cross-region failover: reads (any kind) or NotSent multi-master + // writes (ambiguous writes are gated above). + if kind == connectionErrorNotSent { + // Mark + demote-to-tail; resolveFromHead pins the next resolve + // to index 0, which now points at the failover region. + if _, err := p.gem.MarkEndpointUnavailableForRead(*req.Raw().URL); err != nil { return false, err } + if isWriteOperation { + if _, err := p.gem.MarkEndpointUnavailableForWrite(*req.Raw().URL); err != nil { + return false, err + } + } + retryContext.resolveFromHead = true + } else { + // Ambiguous read: don't mark. Route to the next region by + // bumping retryCount (Do loop uses it as locationIndex when + // resolveFromHead is not set). Mirrors the 503/408 paths. + retryContext.retryCount += 1 } - // No forced gem.Update: gating the failover on it would surface a - // metadata-endpoint timeout (the global FQDN often resolves to the - // same regional FE pool we just marked unavailable) and skip the - // cross-region retry. invalidate() inside MarkEndpointUnavailable* - // arms the next non-force Update for real topology changes. retryContext.sameRegionRetryCount = 0 - // Demote-to-tail leaves the bad endpoint at index 1+; force the - // next resolve to use index 0 instead of the (possibly bumped) - // retryCount, otherwise we'd route right back to the demoted slot. - retryContext.resolveFromHead = true retryContext.crossRegionFailoverDone = true if sleepErr := sleepWithContext(req.Raw().Context(), defaultBackoff*time.Second); sleepErr != nil { return false, fmt.Errorf("%w: underlying transport error: %v", sleepErr, transportErr) diff --git a/sdk/data/azcosmos/cosmos_client_retry_policy_test.go b/sdk/data/azcosmos/cosmos_client_retry_policy_test.go index 2acdde358d22..80df850ea0ad 100644 --- a/sdk/data/azcosmos/cosmos_client_retry_policy_test.go +++ b/sdk/data/azcosmos/cosmos_client_retry_policy_test.go @@ -813,23 +813,80 @@ func TestAmbiguousConnectionErrorWriteDoesNotFailOver(t *testing.T) { } func TestAmbiguousConnectionErrorReadFailsOver(t *testing.T) { - client, srv, verifier, cleanup := setupRetryPolicyTestClient(t) - defer cleanup() + // Verify the ambiguous-read failover actually routes the in-flight + // retry to a different region. The new mechanism (bump retryCount + // instead of mark + demote) must move us from badSrv to goodSrv + // without touching the location cache. + badSrv, badClose := mock.NewTLSServer() + defer badClose() + goodSrv, goodClose := mock.NewTLSServer() + defer goodClose() + + badURL, err := url.Parse(badSrv.URL()) + require.NoError(t, err) + goodURL, err := url.Parse(goodSrv.URL()) + require.NoError(t, err) + + gemServer, gemClose := mock.NewTLSServer() + defer gemClose() + gemServer.SetError(&net.DNSError{}) + internalPipeline := azruntime.NewPipeline("azcosmosgemtest", "v1.0.0", azruntime.PipelineOptions{}, &policy.ClientOptions{Transport: gemServer}) + + lc := newLocationCache([]string{"East US", "Central US"}, *badURL, true /*enableCrossRegionRetries*/) + require.NoError(t, lc.update( + []accountRegion{{Name: "East US", Endpoint: badSrv.URL()}}, + []accountRegion{ + {Name: "East US", Endpoint: badSrv.URL()}, + {Name: "Central US", Endpoint: goodSrv.URL()}, + }, + []string{"East US", "Central US"}, + nil, + )) + + gem := &globalEndpointManager{ + clientEndpoint: gemServer.URL(), + pipeline: internalPipeline, + preferredLocations: []string{"East US", "Central US"}, + locationCache: lc, + refreshTimeInterval: defaultExpirationTime, + lastUpdateTime: time.Time{}, + } + + routingTransport := routingMockTransport{ + byHost: map[string]*mock.Server{ + badURL.Host: badSrv, + goodURL.Host: goodSrv, + }, + } + + retryPolicy := &clientRetryPolicy{gem: gem} + verifier := &clientRetryPolicyVerifier{} + internalClient, _ := azcore.NewClient("azcosmostest", "v1.0.0", azruntime.PipelineOptions{PerRetry: []policy.Policy{verifier, retryPolicy}}, &policy.ClientOptions{Transport: &routingTransport}) + client := &Client{endpoint: badSrv.URL(), endpointUrl: badURL, internal: internalClient, gem: gem} ambErr := &fakeAmbiguousNetError{msg: "connection reset by peer"} + // 1 initial + 3 same-region retries on the bad region. for i := 0; i < 4; i++ { - srv.AppendError(ambErr) + badSrv.AppendError(ambErr) } - srv.AppendResponse(mock.WithStatusCode(200)) + // Cross-region failover should hit the good region. + goodSrv.AppendResponse(mock.WithStatusCode(200)) db, _ := client.NewDatabase("database_id") container, _ := db.NewContainer("container_id") - _, err := container.ReadItem(context.TODO(), NewPartitionKeyString("1"), "doc1", nil) + _, err = container.ReadItem(context.TODO(), NewPartitionKeyString("1"), "doc1", nil) - assert.NoError(t, err) + require.NoError(t, err, "ambiguous-read failover should reach the good region") rc := verifier.requests[0].retryContext + assert.True(t, rc.crossRegionFailoverDone) + // Ambiguous failover bumps retryCount instead of marking + demoting, + // so ResolveServiceEndpoint(1) returns the second preferred region. + assert.Equal(t, 1, rc.retryCount) assert.Equal(t, 0, rc.sameRegionRetryCount) - assert.Equal(t, 0, rc.retryCount) // post-fix: retryCount not incremented on connection-error failover; demote-in-cache handles routing + assert.Equal(t, 4, badSrv.Requests()) + assert.Equal(t, 1, goodSrv.Requests()) + assert.Empty(t, lc.locationUnavailabilityInfoMap, + "ambiguous read failover must not mark any endpoint unavailable") } func TestCallerDeadlineExceededDoesNotRetry(t *testing.T) { @@ -1003,10 +1060,10 @@ func TestSingleMasterWriteDoesNotFailoverOnConnectionError(t *testing.T) { } } -func TestAmbiguousWriteMarksEndpointUnavailableForRead(t *testing.T) { - // Multi-master write that gives up on an ambiguous transport error - // should still mark the endpoint unavailable for read so concurrent - // requests learn about the regional outage. +func TestAmbiguousWriteDoesNotMarkEndpointUnavailable(t *testing.T) { + // Ambiguous transport errors are too weak a signal to mark the + // region unavailable; the request may even have been processed + // server-side. client, srv, verifier, cleanup := setupRetryPolicyTestClient(t) defer cleanup() @@ -1022,20 +1079,11 @@ func TestAmbiguousWriteMarksEndpointUnavailableForRead(t *testing.T) { require.NoError(t, err) _, err = container.CreateItem(context.TODO(), NewPartitionKeyString("1"), marshalled, nil) - // At least one endpoint must have been marked unavailable for write - // (single-master would NOT do this; we use multi-master here). require.Error(t, err) rc := verifier.requests[0].retryContext assert.False(t, rc.crossRegionFailoverDone) - // Marked unavailable for read for at least one endpoint. - var markedForRead bool - for _, info := range client.gem.locationCache.locationUnavailabilityInfoMap { - if info.unavailableOps == read || info.unavailableOps == all { - markedForRead = true - break - } - } - assert.True(t, markedForRead, "expected at least one endpoint marked unavailable for read") + assert.Empty(t, client.gem.locationCache.locationUnavailabilityInfoMap, + "ambiguous write must not mark any endpoint unavailable") } func TestConnectionErrorWithCrossRegionRetriesDisabledFailsFast(t *testing.T) {