Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sdk/data/azcosmos/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@

### Features Added

* Added a dedicated 429 (Too Many Requests) throttling retry policy that honors the `x-ms-retry-after-ms` response header and is configurable via `ClientOptions.ThrottlingRetryOptions` (`MaxRetryAttempts`, `MaxRetryWaitTime`). This brings parity with the throttling retry behavior in the .NET, Java, and Python Cosmos SDKs. When `ClientOptions.Retry.StatusCodes` and `ClientOptions.Retry.ShouldRetry` are both unset, 429 is no longer in the azcore retry policy's default status codes (it is now handled exclusively by the throttling retry policy); the other transient status codes (408, 500, 502, 503, 504) remain.
Copy link
Copy Markdown
Member

@simorenoh simorenoh Jun 1, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Added a dedicated 429 (Too Many Requests) throttling retry policy that honors the `x-ms-retry-after-ms` response header and is configurable via `ClientOptions.ThrottlingRetryOptions` (`MaxRetryAttempts`, `MaxRetryWaitTime`). This brings parity with the throttling retry behavior in the .NET, Java, and Python Cosmos SDKs. When `ClientOptions.Retry.StatusCodes` and `ClientOptions.Retry.ShouldRetry` are both unset, 429 is no longer in the azcore retry policy's default status codes (it is now handled exclusively by the throttling retry policy); the other transient status codes (408, 500, 502, 503, 504) remain.
* Added a dedicated 429 (Too Many Requests) throttling retry policy that honors the `x-ms-retry-after-ms` response header and is configurable via `ClientOptions.ThrottlingRetryOptions` (`MaxRetryAttempts`, `MaxRetryWaitTime`). When `ClientOptions.Retry.StatusCodes` and `ClientOptions.Retry.ShouldRetry` are both unset, 429 is no longer in the azcore retry policy's default status codes (it is now handled exclusively by the throttling retry policy); the other transient status codes (408, 500, 502, 503, 504) remain. See [PR 26820](https://github.com/Azure/azure-sdk-for-go/pull/26820).


### Breaking Changes

### Bugs Fixed

### Other Changes

* Throttling retry policy: an explicit `x-ms-retry-after-ms: 0` header is now honored as "retry immediately" instead of being treated as a missing header (which would have applied the default delay). NaN/Inf values for the header are now rejected as invalid. The request body is rewound before the 429 response body is drained so a rewind failure surfaces a usable 429 response to the caller.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we probably don't need this since these changes are also in this PR for the first time right?


## 1.5.0-beta.6 (2026-05-15)

### Features Added
Expand Down
28 changes: 27 additions & 1 deletion sdk/data/azcosmos/cosmos_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,16 @@ func newClient(authPolicy policy.Policy, gem *globalEndpointManager, options *Cl
if options == nil {
options = &ClientOptions{}
}
// Copy the embedded azcore.ClientOptions so adjustments to retry defaults
// don't mutate the caller-supplied struct. The throttleRetryPolicy below
// owns 429 handling with Cosmos-specific semantics (x-ms-retry-after-ms
// header, cumulative wait budget), so when the caller hasn't customized
// the retry status codes or supplied a ShouldRetry callback, exclude 429
// from azcore's default retry list to avoid double-retry.
clientOpts := options.ClientOptions
if clientOpts.Retry.StatusCodes == nil && clientOpts.Retry.ShouldRetry == nil {
clientOpts.Retry.StatusCodes = defaultAzcoreRetryStatusCodesWithout429()
}
return azcore.NewClient(moduleName, serviceLibVersion,
azruntime.PipelineOptions{
AllowedHeaders: getAllowedHeaders(),
Expand All @@ -197,13 +207,29 @@ func newClient(authPolicy policy.Policy, gem *globalEndpointManager, options *Cl
},
PerRetry: []policy.Policy{
authPolicy,
newThrottleRetryPolicy(&options.ThrottlingRetryOptions),
&clientRetryPolicy{gem: gem},
},
Tracing: azruntime.TracingOptions{
Namespace: "Microsoft.DocumentDB",
},
},
&options.ClientOptions)
&clientOpts)
}

// defaultAzcoreRetryStatusCodesWithout429 returns azcore's default retryable
// HTTP status codes with 429 removed. The Cosmos throttleRetryPolicy already
// retries 429 with x-ms-retry-after-ms semantics and a cumulative wait budget,
// so layering azcore's default 429 retry on top would result in compounded
// retry attempts.
func defaultAzcoreRetryStatusCodesWithout429() []int {
return []int{
http.StatusRequestTimeout, // 408
http.StatusInternalServerError, // 500
http.StatusBadGateway, // 502
http.StatusServiceUnavailable, // 503
http.StatusGatewayTimeout, // 504
}
}

func newInternalPipeline(authPolicy policy.Policy, options *ClientOptions) azruntime.Pipeline {
Expand Down
22 changes: 22 additions & 0 deletions sdk/data/azcosmos/cosmos_client_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package azcosmos

import (
"time"

"github.com/Azure/azure-sdk-for-go/sdk/azcore"
)

Expand All @@ -25,4 +27,24 @@ type ClientOptions struct {
// The valid range is 1 to 5 (inclusive).
// Can be overridden per-request via the operation options.
ThroughputBucket *int32
// ThrottlingRetryOptions configures how the client retries requests that fail with
// HTTP 429 (Too Many Requests). When unset, defaults consistent with the other
// Cosmos SDKs are used (9 attempts, 30s cumulative wait).
ThrottlingRetryOptions ThrottlingRetryOptions
}

// ThrottlingRetryOptions configures the retry behavior for HTTP 429
// (Too Many Requests) responses. The Cosmos service indicates the recommended
// retry delay via the x-ms-retry-after-ms response header; the client respects
// that value subject to the limits in this struct.
type ThrottlingRetryOptions struct {
// MaxRetryAttempts is the maximum number of times the client will retry a
// throttled request. The default is 9. Set to a negative value to disable
// throttling retries.
MaxRetryAttempts int
// MaxRetryWaitTime is the maximum cumulative time the client will spend
// waiting between throttled retries for a single request. Once this budget
// is exhausted, the most recent 429 response is returned to the caller.
// The default is 30 seconds.
MaxRetryWaitTime time.Duration
}
1 change: 1 addition & 0 deletions sdk/data/azcosmos/cosmos_http_constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ const (
headerDedicatedGatewayBypassCache string = "x-ms-dedicatedgateway-bypass-cache"
cosmosHeaderPriorityLevel string = "x-ms-cosmos-priority-level"
cosmosHeaderThroughputBucket string = "x-ms-cosmos-throughput-bucket"
cosmosHeaderRetryAfterMs string = "x-ms-retry-after-ms"
)

const (
Expand Down
132 changes: 132 additions & 0 deletions sdk/data/azcosmos/cosmos_throttle_retry_policy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package azcosmos

import (
"math"
"net/http"
"strconv"
"time"

azlog "github.com/Azure/azure-sdk-for-go/sdk/azcore/log"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/policy"
azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime"
"github.com/Azure/azure-sdk-for-go/sdk/internal/log"
)

const (
defaultMaxThrottleRetryAttempts = 9
defaultMaxThrottleRetryWaitTime = 30 * time.Second
defaultThrottleRetryDelay = 5 * time.Second
)

// throttleRetryPolicy retries requests that fail with HTTP 429 (Too Many Requests).
// It honors the Cosmos-specific x-ms-retry-after-ms header to determine the
// delay between attempts and caps the number of attempts and total cumulative
// retry delay. This matches the throttling retry behavior of the other Cosmos
// SDKs (.NET, Java, Python).
type throttleRetryPolicy struct {
maxRetryAttempts int
maxRetryWaitTime time.Duration
// defaultDelay is used when a 429 response is missing the
// x-ms-retry-after-ms header. Defaults to defaultThrottleRetryDelay.
defaultDelay time.Duration
}

// newThrottleRetryPolicy constructs a throttleRetryPolicy. For MaxRetryAttempts,
// a positive value is used as the cap, zero falls back to the default
// (defaultMaxThrottleRetryAttempts), and a negative value disables throttling
// retries entirely. For MaxRetryWaitTime, a non-positive value falls back to
// the default (defaultMaxThrottleRetryWaitTime).
func newThrottleRetryPolicy(o *ThrottlingRetryOptions) *throttleRetryPolicy {
p := &throttleRetryPolicy{
maxRetryAttempts: defaultMaxThrottleRetryAttempts,
maxRetryWaitTime: defaultMaxThrottleRetryWaitTime,
defaultDelay: defaultThrottleRetryDelay,
}
if o != nil {
if o.MaxRetryAttempts > 0 {
p.maxRetryAttempts = o.MaxRetryAttempts
} else if o.MaxRetryAttempts < 0 {
// negative values disable throttling retries entirely
p.maxRetryAttempts = 0
}
if o.MaxRetryWaitTime > 0 {
p.maxRetryWaitTime = o.MaxRetryWaitTime
}
}
return p
}

func (p *throttleRetryPolicy) Do(req *policy.Request) (*http.Response, error) {
attemptCount := 0
cumulativeDelay := time.Duration(0)
for {
response, err := req.Next()
// Transport / non-HTTP errors are not throttling; let other policies decide.
if err != nil || response == nil || response.StatusCode != http.StatusTooManyRequests {
return response, err
}

if attemptCount >= p.maxRetryAttempts {
log.Writef(azlog.EventRetryPolicy, "Cosmos throttle retry exhausted attempts (%d); returning 429 to caller", p.maxRetryAttempts)
return response, nil
}

delay, ok := readRetryAfterMs(response)
if !ok {
// header missing or unparseable; fall back to the default delay.
// an explicit "0" header is honored (retry immediately).
delay = p.defaultDelay
}

if cumulativeDelay+delay > p.maxRetryWaitTime {
log.Writef(azlog.EventRetryPolicy, "Cosmos throttle retry exceeded cumulative wait time (%s); returning 429 to caller", p.maxRetryWaitTime)
return response, nil
}

cumulativeDelay += delay
attemptCount++

// Rewind the request body before discarding the response so that, if
// the body isn't seekable, the caller still receives a usable 429
// response for diagnostics.
if err := req.RewindBody(); err != nil {
return response, err
}

// drain and close the response body so the connection can be reused
azruntime.Drain(response)

log.Writef(azlog.EventRetryPolicy, "Cosmos throttle retry attempt %d after %s (cumulative %s)", attemptCount, delay, cumulativeDelay)

timer := time.NewTimer(delay)
select {
case <-timer.C:
case <-req.Raw().Context().Done():
timer.Stop()
return response, req.Raw().Context().Err()
}
}
}

// readRetryAfterMs parses the Cosmos x-ms-retry-after-ms header (milliseconds).
// Returns (delay, true) on a successful parse of a non-negative finite value
// (including an explicit "0", which means "retry immediately"). Returns
// (0, false) when the header is missing, unparseable, NaN, infinite, or
// negative so that the caller can apply a default delay only in that case.
func readRetryAfterMs(resp *http.Response) (time.Duration, bool) {
if resp == nil {
return 0, false
}
v := resp.Header.Get(cosmosHeaderRetryAfterMs)
if v == "" {
return 0, false
}
ms, err := strconv.ParseFloat(v, 64)
if err != nil || math.IsNaN(ms) || math.IsInf(ms, 0) || ms < 0 {
return 0, false
}
return time.Duration(ms * float64(time.Millisecond)), true
}
Loading
Loading