Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sdk/data/azcosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

### Features Added

* Added retry policy for transient `500`, `502`, and `504` server errors on read requests. The request is retried once in the current region and, if applicable, once against the next preferred region. Writes are not retried. This matches the behavior of the .NET, Java, and Python Cosmos SDKs. See [PR 26821](https://github.com/Azure/azure-sdk-for-go/pull/26821).

### Breaking Changes

### Bugs Fixed
Expand Down
67 changes: 65 additions & 2 deletions sdk/data/azcosmos/cosmos_client_retry_policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,10 @@ type retryContext struct {
retryCount int
sessionRetryCount int
preferredLocationIndex int
// serverErrorRetryCount tracks the number of retries attempted for a
// transient 5xx server error (500/502/504). Only reads are retried;
// the budget is one in-region retry followed by one cross-region retry.
serverErrorRetryCount int
// sameRegionRetryCount tracks the number of consecutive retries we have
// attempted against the currently-resolved endpoint for a connection
// error chain. It resets to 0 whenever we fail over to another region
Expand All @@ -138,6 +142,11 @@ type retryContext struct {
// Retry tuning constants for the client retry policy.
const (
	// maxRetryCount is presumably the overall cap on retry attempts for a
	// single request -- NOTE(review): confirm against its use in Do.
	maxRetryCount = 120

	// defaultBackoff is presumably the base delay between retries; the
	// unit is not visible here -- NOTE(review): confirm at the call site.
	defaultBackoff = 1

	// maxServerErrorRetryCount caps the retries spent on a transient 5xx
	// server error at two: the first stays in the current region and the
	// second goes to another region.
	maxServerErrorRetryCount = 2
)

// sleepWithContext sleeps for d, but returns early with the context's error
// if ctx is cancelled or its deadline expires. Use this in retry paths so
// the policy honors caller-set context deadlines instead of consuming the
Expand Down Expand Up @@ -234,6 +243,11 @@ func (p *clientRetryPolicy) Do(req *policy.Request) (*http.Response, error) {
subStatus := response.Header.Get(cosmosHeaderSubstatus)
if p.shouldRetryStatus(response.StatusCode, subStatus) {
retryContext.useWriteEndpoint = false
// advanceLocation gates whether the post-switch logic advances
// retryCount (which moves the resolved endpoint to the next
// region). An in-region 5xx retry sets it to false so the
// retry targets the same endpoint.
advanceLocation := true
switch response.StatusCode {
case http.StatusForbidden:
shouldRetry, err := p.attemptRetryOnEndpointFailure(req, o.isWriteOperation, &retryContext)
Expand All @@ -259,12 +273,25 @@ func (p *clientRetryPolicy) Do(req *policy.Request) (*http.Response, error) {
if !shouldRetry {
return nil, errorinfo.NonRetriableError(azruntime.NewResponseErrorWithErrorCode(response, response.Status))
}
case http.StatusInternalServerError, http.StatusBadGateway, http.StatusGatewayTimeout:
shouldRetry, inRegion := p.attemptRetryOnServerError(o.isWriteOperation, &retryContext)
if !shouldRetry {
return nil, errorinfo.NonRetriableError(azruntime.NewResponseErrorWithErrorCode(response, response.Status))
}
// The in-region retry targets the same endpoint, so do not
// advance retryCount. The cross-region retry advances the
// location via preferredLocationIndex and retryCount.
if inRegion {
advanceLocation = false
}
}
err = req.RewindBody()
if err != nil {
return response, err
}
retryContext.retryCount += 1
if advanceLocation {
retryContext.retryCount += 1
}
// HTTP-status retries can change the endpoint (via retryCount
// or preferredLocationIndex). Reset the connection-error
// same-region budget so a fresh chain of connection errors
Expand All @@ -283,7 +310,10 @@ func (p *clientRetryPolicy) shouldRetryStatus(status int, subStatus string) (sho
if (status == http.StatusForbidden && (subStatus == subStatusWriteForbidden || subStatus == subStatusDatabaseAccountNotFound)) ||
(status == http.StatusNotFound && subStatus == subStatusReadSessionNotAvailable) ||
(status == http.StatusServiceUnavailable) ||
(status == http.StatusRequestTimeout) {
(status == http.StatusRequestTimeout) ||
(status == http.StatusInternalServerError) ||
(status == http.StatusBadGateway) ||
(status == http.StatusGatewayTimeout) {
return true
}
return false
Expand Down Expand Up @@ -510,6 +540,39 @@ func (p *clientRetryPolicy) attemptRetryOnServiceUnavailable(isWriteOperation bo
return true
}

// attemptRetryOnServerError applies the 5xx retry policy for transient server
// errors (500 Internal Server Error, 502 Bad Gateway, 504 Gateway Timeout).
// Consistent with the other Cosmos SDKs (.NET, Python, Java), only read
// operations are retried. The retry budget is one in-region retry followed by
// one cross-region retry, after which the error is surfaced to the caller. The
// cross-region retry is only attempted when cross-region retries are enabled, a
// preferred location is available to fail over to, and the location cache has
// resolved more than one read endpoint -- otherwise the "cross-region" retry
// would just hit the same endpoint as the in-region retry. The returned
// inRegion flag tells the caller whether to keep targeting the current endpoint
// (true) or to advance to the next preferred region (false).
func (p *clientRetryPolicy) attemptRetryOnServerError(isWriteOperation bool, retryContext *retryContext) (shouldRetry bool, inRegion bool) {
if isWriteOperation {
return false, false
}
if retryContext.serverErrorRetryCount >= maxServerErrorRetryCount {
return false, false
}
if retryContext.serverErrorRetryCount == 0 {
retryContext.serverErrorRetryCount += 1
return true, true
}
if !p.gem.locationCache.enableCrossRegionRetries || retryContext.preferredLocationIndex >= len(p.gem.preferredLocations) {
return false, false
}
if p.gem.locationCache.readEndpointCount() <= 1 {
return false, false
}
retryContext.serverErrorRetryCount += 1
retryContext.preferredLocationIndex += 1
Comment thread
andrewmathew1 marked this conversation as resolved.
return true, false
}

// attemptRetryOnRequestTimeout handles an HTTP 408 from the service. A
// 408 is ambiguous from a write-safety standpoint (the request may or
// may not have been processed before the server timed out), so only
Expand Down
Loading
Loading