diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index 713c114b7e..9d1ab10d1d 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -37,7 +37,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private readonly IDocumentClientRetryPolicy throttlingRetry; private readonly GlobalEndpointManager globalEndpointManager; private readonly GlobalPartitionEndpointManager partitionKeyRangeLocationCache; - private readonly bool enableEndpointDiscovery; + private readonly bool enableEndpointDiscovery; private readonly bool isThinClientEnabled; private int failoverRetryCount; @@ -51,7 +51,7 @@ internal sealed class ClientRetryPolicy : IDocumentClientRetryPolicy private bool isDtxRequest; private Uri locationEndpoint; private RetryContext retryContext; - private DocumentServiceRequest documentServiceRequest; + private DocumentServiceRequest documentServiceRequest; #if !INTERNAL private volatile bool addHubRegionProcessingOnlyHeader; private CrossRegionAvailabilityContext crossRegionAvailabilityContext; @@ -61,7 +61,7 @@ public ClientRetryPolicy( GlobalEndpointManager globalEndpointManager, GlobalPartitionEndpointManager partitionKeyRangeLocationCache, RetryOptions retryOptions, - bool enableEndpointDiscovery, + bool enableEndpointDiscovery, bool isThinClientEnabled) { this.throttlingRetry = new ResourceThrottleRetryPolicy( @@ -75,7 +75,7 @@ public ClientRetryPolicy( this.sessionTokenRetryCount = 0; this.serviceUnavailableRetryCount = 0; this.canUseMultipleWriteLocations = false; - this.isMultiMasterWriteRequest = false; + this.isMultiMasterWriteRequest = false; this.isThinClientEnabled = isThinClientEnabled; } @@ -248,7 +248,7 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) // set location-based routing directive based on request retry context request.RequestContext.RouteToLocation(this.retryContext.RetryLocationIndex, this.retryContext.RetryRequestOnPreferredLocations); } - } + } #if !INTERNAL // Initialize CrossRegionAvailabilityContext from Properties if not already set. // In hedging scenarios, Properties carries the shared context instance injected by @@ -277,6 +277,36 @@ public void OnBeforeSendRequest(DocumentServiceRequest request) : this.globalEndpointManager.ResolveServiceEndpoint(request); request.RequestContext.RouteToLocation(this.locationEndpoint); + + // Hedging-Detection API: tag the upcoming dispatch reason on Properties so that + // the downstream dispatch site (TransportHandler / GatewayStoreModel) can append + // a RequestedRegion entry with the correct reason. Only override when this is a + // genuine retry attempt — first-attempt dispatches default to Initial (set by + // the dispatch site), and hedge-arm dispatches have their Hedging reason set by + // CrossRegionHedgingAvailabilityStrategy before reaching this policy. + // + // Hedging preservation invariant: when a hedge arm itself triggers a retry (e.g. + // 410 Gone / 449), this method is re-entered with retryContext != null on the + // same cloned RequestMessage. The Hedging value previously seeded by + // CrossRegionHedgingAvailabilityStrategy.CloneAndSendAsync must NOT be silently + // overwritten with OperationRetry / RegionFailover, otherwise the hedge origin + // is lost from the GetRequestedRegions() sequence. Preserve the existing value + // if it is already Hedging. + if (this.retryContext != null && request.Properties != null) + { + bool alreadyTaggedAsHedging = + request.Properties.TryGetValue(Tracing.HedgingDetectionState.DispatchReasonPropertyKey, out object existingReasonObj) + && existingReasonObj is RequestedRegionReason existingReason + && existingReason == RequestedRegionReason.Hedging; + + if (!alreadyTaggedAsHedging) + { + RequestedRegionReason reason = this.retryContext.RetryRequestOnPreferredLocations + ? RequestedRegionReason.RegionFailover + : RequestedRegionReason.OperationRetry; + request.Properties[Tracing.HedgingDetectionState.DispatchReasonPropertyKey] = reason; + } + } } private async Task ShouldRetryInternalAsync( @@ -371,14 +401,14 @@ private async Task ShouldRetryInternalAsync( markBothReadAndWriteAsUnavailable: false, forceRefresh: false, retryOnPreferredLocations: true); - } - + } + if (statusCode == HttpStatusCode.NotFound && subStatusCode == SubStatusCodes.ReadSessionNotAvailable) { return this.ShouldRetryOnSessionNotAvailable(this.documentServiceRequest); - } - - // Received 503 due to client connect timeout or Gateway + } + + // Received 503 due to client connect timeout or Gateway if (statusCode == HttpStatusCode.ServiceUnavailable) { return this.TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceUnavailable( diff --git a/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosDiagnostics.cs b/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosDiagnostics.cs index b88eaa8efe..db18b4cb65 100644 --- a/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosDiagnostics.cs +++ b/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosDiagnostics.cs @@ -97,5 +97,100 @@ public virtual ServerSideCumulativeMetrics GetQueryMetrics() /// The returned list contains unique regions and doesn't guarantee ordering of the regions contacted from the first to the last /// public abstract IReadOnlyList<(string regionName, Uri uri)> GetContactedRegions(); + + /// + /// Returns true if the SDK actually dispatched this operation to a hedge + /// region as part of a cross-region availability-strategy fan-out. Returns + /// false for non-hedged operations and for hedged operations whose primary + /// responded under the configured threshold (hedge tasks were registered but never + /// awaited; no fan-out occurred). + /// + /// + /// true if at least one hedge arm was actually dispatched; otherwise false. + /// + /// + /// + /// A return value of false does NOT mean hedging was disabled or + /// misconfigured. To check whether hedging was configured on the client, + /// inspect CosmosClientOptions.AvailabilityStrategy or + /// RequestOptions.AvailabilityStrategy directly. + /// + /// + /// O(1). Safe to call on both the success path (ItemResponse<T>.Diagnostics) + /// and the error path (CosmosException.Diagnostics). Returns false for + /// diagnostics objects created outside the SDK pipeline (e.g. customer-authored + /// subclasses of ). + /// + /// + public virtual bool HedgingStarted() + { + return false; + } + + /// + /// Returns the regions the SDK actually dispatched this operation to, in observed + /// dispatch order, each tagged with the reason the SDK chose it. Includes the + /// initial attempt. + /// + /// + /// A read-only list of entries. Never null; + /// returns an empty list when there is no dispatch history (for example, when a + /// pre-flight client-side validation failed before any dispatch was attempted, or + /// for diagnostics objects created outside the SDK pipeline). + /// + /// + /// + /// The append site is the actual dispatch point; registered-but-never-awaited + /// hedge tasks do not appear here. is + /// non-exhaustive — callers MUST include a default arm when switching on + /// the enum to handle future values. + /// + /// + /// Contract is "dispatched, not necessarily wire-issued". An entry reflects + /// the SDK's decision to commit to dispatching — for hedge arms specifically, this + /// means the per-arm threshold delay elapsed without cancellation. A cancellation + /// arriving in the microsecond-wide window between that dispatch decision and the + /// underlying HTTP/RNTBD send leaving the process still leaves the entry in this + /// list. Callers should treat the list as a record of intent-to-dispatch, not a + /// record of wire-issued requests. + /// + /// + /// Duplicates are allowed (the same region can be tried multiple times across + /// retries, fail-overs and probes). Returns empty for SDK clients constructed + /// before this version's diagnostics plumbing populated it. Available on success + /// and error paths. + /// + /// + public virtual IReadOnlyList GetRequestedRegions() + { + return Array.Empty(); + } + + /// + /// Returns the regions that responded to this operation (success or failure), in + /// arrival order as observed by the SDK orchestrator. + /// + /// + /// A read-only list of region names. Never null; returns an empty list when + /// no response arrived (for example, when all dispatches were cancelled or timed + /// out) or for diagnostics objects created outside the SDK pipeline. + /// + /// + /// + /// Duplicates are allowed and expected. The same region may appear more + /// than once if it produced multiple responses (e.g., a late response after a + /// hedge winner). Count > 1 does not imply that more than one + /// distinct region responded. For unique regions, call + /// .Distinct(StringComparer.OrdinalIgnoreCase). + /// + /// + /// Returns empty for SDK clients constructed before this version's diagnostics + /// plumbing populated it. Available on success and error paths. + /// + /// + public virtual IReadOnlyList GetRespondedRegions() + { + return Array.Empty(); + } } } diff --git a/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosTraceDiagnostics.cs b/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosTraceDiagnostics.cs index 5eae8492ea..e07d366cb5 100644 --- a/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosTraceDiagnostics.cs +++ b/Microsoft.Azure.Cosmos/src/Diagnostics/CosmosTraceDiagnostics.cs @@ -158,5 +158,25 @@ public override int GetFailedRequestCount() return this.Value.Summary.GetFailedCount(); } + + /// + public override bool HedgingStarted() + { + return this.Value?.Summary?.HedgingDetectionState?.HedgingStarted ?? false; + } + + /// + public override IReadOnlyList GetRequestedRegions() + { + return this.Value?.Summary?.HedgingDetectionState?.GetRequestedRegionsSnapshot() + ?? Array.Empty(); + } + + /// + public override IReadOnlyList GetRespondedRegions() + { + return this.Value?.Summary?.HedgingDetectionState?.GetRespondedRegionsSnapshot() + ?? Array.Empty(); + } } } \ No newline at end of file diff --git a/Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegion.cs b/Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegion.cs new file mode 100644 index 0000000000..90b4a175d6 --- /dev/null +++ b/Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegion.cs @@ -0,0 +1,103 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ +namespace Microsoft.Azure.Cosmos +{ + using System; + + /// + /// Represents a single region the SDK dispatched a request to as part of an operation, + /// tagged with the reason the orchestrator chose to send it. + /// + /// + /// Used by to enumerate every + /// dispatched attempt in observed dispatch order, including the initial attempt and any + /// retries, region fail-overs, hedge arms, or circuit-breaker probes. + /// This value is immutable. Equality is case-insensitive on + /// and exact on . + /// + public readonly struct RequestedRegion : IEquatable + { + /// + /// Initializes a new . + /// + /// + /// The name of the region the SDK dispatched to (e.g. "East US"). Must not be null. + /// + /// The reason the SDK chose this region for this dispatch. + /// is null. + public RequestedRegion(string regionName, RequestedRegionReason reason) + { + this.RegionName = regionName ?? throw new ArgumentNullException(nameof(regionName)); + this.Reason = reason; + } + + /// + /// Gets the name of the region the SDK dispatched to. + /// + public string RegionName { get; } + + /// + /// Gets the reason the SDK chose this region for this particular dispatch attempt. + /// + public RequestedRegionReason Reason { get; } + + /// + /// Determines whether this instance equals another by + /// comparing case-insensitively and exactly. + /// + /// The other to compare against. + /// true when both region names match (case-insensitive) and the reasons match exactly; otherwise false. + public bool Equals(RequestedRegion other) + { + return string.Equals(this.RegionName, other.RegionName, StringComparison.OrdinalIgnoreCase) + && this.Reason == other.Reason; + } + + /// + public override bool Equals(object obj) + { + return obj is RequestedRegion other && this.Equals(other); + } + + /// + public override int GetHashCode() + { + int regionHash = this.RegionName == null + ? 0 + : StringComparer.OrdinalIgnoreCase.GetHashCode(this.RegionName); + // Combine without depending on System.HashCode (net6+) to keep this constant + // across TFMs. + unchecked + { + return (regionHash * 397) ^ (byte)this.Reason; + } + } + + /// + /// Returns a human-readable representation of this in the + /// form "{regionName}:{reason}". + /// + /// A string of the form "{regionName}:{reason}". + public override string ToString() + { + return $"{this.RegionName}:{this.Reason}"; + } + + /// + /// Equality operator. + /// + /// The left operand. + /// The right operand. + /// true if the two values are equal; otherwise false. + public static bool operator ==(RequestedRegion left, RequestedRegion right) => left.Equals(right); + + /// + /// Inequality operator. + /// + /// The left operand. + /// The right operand. + /// true if the two values are not equal; otherwise false. + public static bool operator !=(RequestedRegion left, RequestedRegion right) => !left.Equals(right); + } +} diff --git a/Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegionReason.cs b/Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegionReason.cs new file mode 100644 index 0000000000..2e27ae81e3 --- /dev/null +++ b/Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegionReason.cs @@ -0,0 +1,103 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ +namespace Microsoft.Azure.Cosmos +{ + /// + /// Reason the SDK chose to dispatch a request to a particular region. + /// + /// + /// + /// This enum is non-exhaustive. Additional values may be added in future SDK + /// versions to describe new dispatch reasons. Callers that switch on a + /// value MUST include a default arm + /// to handle unknown values gracefully, for example: + /// + /// + /// + /// + /// + /// Default-value sentinel: is the underlying-zero value and is + /// the value of default(RequestedRegionReason) (and therefore of + /// default(RequestedRegion).Reason). The SDK never emits a + /// entry from a real dispatch; if a caller observes this value it means the + /// was produced by struct-default construction + /// (e.g. new RequestedRegion[N], an uninitialized field, or a deserialized value + /// where the reason was absent) rather than by an SDK dispatch. + /// + /// + /// Cross-SDK note: not every reason is populated by every SDK in every release. In the + /// .NET SDK, is reserved but not emitted from the SDK + /// layer in this version because the transport retry decision is owned by the closed-source + /// Microsoft.Azure.Cosmos.Direct package and is not observable from the SDK layer. + /// + /// + public enum RequestedRegionReason : byte + { + /// + /// Default sentinel for default(RequestedRegionReason) and + /// default(RequestedRegion).Reason. The SDK does NOT emit this value from a + /// real dispatch — its presence indicates the was + /// produced by struct-default construction rather than an SDK dispatch. + /// + Unknown = 0, + + /// + /// The first dispatch of the operation. Appears exactly once per operation at the + /// beginning of the dispatch sequence. + /// + Initial = 1, + + /// + /// An operation-level retry decided by the SDK's client-retry policy (e.g. retry + /// after 410 Gone, 449 Conflict-on-Write, or throttling). The retry targets the + /// same region as the previous attempt. + /// + OperationRetry = 2, + + /// + /// A transport-level retry inside the per-region transport stack (e.g. TCP reset + /// or single-frame failure). Reserved but not populated by the .NET SDK in this + /// release. Transport retries are issued by the closed-source + /// Microsoft.Azure.Cosmos.Direct package and are not observable from the + /// public SDK layer. + /// + TransportRetry = 3, + + /// + /// A speculative cross-region fan-out dispatch issued by the configured + /// . The primary (first) dispatch is recorded + /// as ; only the additional hedge arms are tagged as + /// . + /// + Hedging = 4, + + /// + /// An endpoint-failure-driven retry to a different region (write conflict re-route, + /// region marked unavailable). The SDK has marked the previous endpoint as failed + /// for this operation and chosen a different region from the preferred-locations list. + /// + RegionFailover = 5, + + /// + /// A probe dispatch to a previously circuit-broken region driven by the SDK's + /// per-partition automatic failover / per-partition per-region circuit breaker + /// (PPCB) health check. + /// + CircuitBreakerProbe = 6, + } +} diff --git a/Microsoft.Azure.Cosmos/src/Handler/TransportHandler.cs b/Microsoft.Azure.Cosmos/src/Handler/TransportHandler.cs index f7d865693c..cfdc38c3a6 100644 --- a/Microsoft.Azure.Cosmos/src/Handler/TransportHandler.cs +++ b/Microsoft.Azure.Cosmos/src/Handler/TransportHandler.cs @@ -10,6 +10,7 @@ namespace Microsoft.Azure.Cosmos.Handlers using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos.Resource.CosmosExceptions; + using Microsoft.Azure.Cosmos.Routing; using Microsoft.Azure.Cosmos.Tracing; using Microsoft.Azure.Cosmos.Tracing.TraceData; using Microsoft.Azure.Documents; @@ -93,6 +94,15 @@ internal async Task ProcessMessageAsync( DocumentServiceRequest serviceRequest = request.ToDocumentServiceRequest(); + // Hedging-Detection API: record the dispatched region + reason on the trace's + // HedgingDetectionState. This is the actual dispatch point for the operation + // (after ClientRetryPolicy.OnBeforeSendRequest has resolved the routing endpoint, + // before the wire send is invoked). Hedge arms set the reason to Hedging on + // Properties prior to entering this handler; retries set the reason to + // OperationRetry/RegionFailover via ClientRetryPolicy.OnBeforeSendRequest; all + // other first attempts default to Initial. + TransportHandler.AppendDispatchedRegion(request, serviceRequest, this.client.DocumentClient.GlobalEndpointManager); + ClientSideRequestStatisticsTraceDatum clientSideRequestStatisticsTraceDatum = new ClientSideRequestStatisticsTraceDatum(DateTime.UtcNow, request.Trace); serviceRequest.RequestContext.ClientRequestStatistics = clientSideRequestStatisticsTraceDatum; @@ -183,5 +193,100 @@ internal static ResponseMessage AggregateExceptionConverter(AggregateException a return null; } + + /// + /// Appends a entry to the operation's + /// at the dispatch point (after the routing + /// endpoint is resolved, before the wire send is invoked). + /// + /// + /// + /// The reason is read from under the + /// well-known key , which + /// upstream sites (ClientRetryPolicy, CrossRegionHedgingAvailabilityStrategy) set to + /// signal why this particular dispatch is happening. Absence of the key implies a + /// first attempt and defaults to . The key + /// is removed after consumption so that subsequent retries on the same request + /// re-default unless a new reason is set. + /// + /// + /// The region name is resolved from the routing endpoint via + /// (DocumentServiceRequest's own + /// RegionName is not populated until later in the dispatch chain, by + /// GatewayStoreModel / AddressResolver). + /// + /// + internal static void AppendDispatchedRegion( + RequestMessage requestMessage, + DocumentServiceRequest serviceRequest, + GlobalEndpointManager globalEndpointManager) + { + if (requestMessage == null || serviceRequest == null) + { + return; + } + + HedgingDetectionState state = requestMessage.Trace?.Summary?.HedgingDetectionState; + if (state == null) + { + return; + } + + // Resolve reason from the property. The Remove is deferred until AFTER a + // successful AppendRequested so a failed region resolution (e.g. thin-client / + // PPCB endpoint not in GlobalEndpointManager's read/write maps; see F4 on + // PR #5868) does not silently swallow the dispatch-reason signal. If we removed + // the key here and then bailed at the region-resolution guard, any downstream + // re-dispatch on the same DocumentServiceRequest would default to Initial even + // though the upstream caller intended a different reason. Pins F5 review + // feedback on PR #5868. + RequestedRegionReason reason = RequestedRegionReason.Initial; + bool propertyPresent = false; + if (serviceRequest.Properties != null + && serviceRequest.Properties.TryGetValue(HedgingDetectionState.DispatchReasonPropertyKey, out object reasonObj) + && reasonObj is RequestedRegionReason resolvedReason) + { + reason = resolvedReason; + propertyPresent = true; + } + + // Resolve region name from the routing endpoint URI. + string regionName = null; + Uri endpoint = serviceRequest.RequestContext?.LocationEndpointToRoute; + if (endpoint != null && globalEndpointManager != null) + { + regionName = globalEndpointManager.GetLocation(endpoint); + } + + // Skip if we couldn't resolve a name; better empty than misleading "unknown". + // Leave Properties[KEY] in place so a re-dispatch can still consume it. + if (string.IsNullOrEmpty(regionName)) + { + return; + } + + state.AppendRequested(regionName, reason); + + // Append succeeded — now safe to consume the signal so subsequent retries + // on the same DocumentServiceRequest re-default unless a new reason is set + // by an upstream site (ClientRetryPolicy or CrossRegionHedgingAvailabilityStrategy). + // + // Exception: when the reason is Hedging, LEAVE THE PROPERTY IN PLACE so that + // subsequent physical retries of this hedge arm (driven by ClientRetryPolicy + // on the same cloned RequestMessage — e.g. 410 Gone / 449 on the arm) remain + // tagged as Hedging. RequestMessage.Properties and the cached DSR's Properties + // are the same reference (see RequestMessage.ToDocumentServiceRequest), so + // removing the key here would drain it from RequestMessage.Properties too — + // and the retry-driven re-entry of ClientRetryPolicy.OnBeforeSendRequest + // would then see an empty slot and overwrite it with OperationRetry / + // RegionFailover, silently losing the hedge origin from the + // GetRequestedRegions() sequence. The preservation guard in + // ClientRetryPolicy.OnBeforeSendRequest (F3) relies on this key still being + // present at retry time. Pins F3 review feedback on PR #5868. + if (propertyPresent && reason != RequestedRegionReason.Hedging) + { + serviceRequest.Properties.Remove(HedgingDetectionState.DispatchReasonPropertyKey); + } + } } } diff --git a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs index 39b592ca38..351b2f7f6e 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs @@ -329,6 +329,16 @@ private async Task CloneAndSendAsync( List excludeRegions = new List(hedgeRegions); excludeRegions.RemoveAt(requestNumber); clonedRequest.RequestOptions.ExcludeRegions = excludeRegions; + + // Hedging-Detection API: this code path is only reached AFTER the + // previous loop iteration's threshold delay elapsed without primary-wins + // cancellation. Tag the upcoming dispatch as Hedging so the downstream + // dispatch site records it with the correct reason. If this method + // is never invoked for a given requestNumber (e.g., primary wins under + // the threshold), no phantom Hedging entry is produced — see AC2/AC13 + // and design doc §12 "no phantom entries". + clonedRequest.Properties[HedgingDetectionState.DispatchReasonPropertyKey] = + RequestedRegionReason.Hedging; } try diff --git a/Microsoft.Azure.Cosmos/src/Tracing/HedgingDetectionState.cs b/Microsoft.Azure.Cosmos/src/Tracing/HedgingDetectionState.cs new file mode 100644 index 0000000000..99d60ac499 --- /dev/null +++ b/Microsoft.Azure.Cosmos/src/Tracing/HedgingDetectionState.cs @@ -0,0 +1,165 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ + +namespace Microsoft.Azure.Cosmos.Tracing +{ + using System; + using System.Collections.Generic; + + /// + /// Per-trace (per-operation) state backing the Hedging Detection API surface + /// (, + /// , + /// ). + /// + /// + /// + /// Lives on so it is shared across the entire trace tree + /// for a single operation. All mutators and accessors acquire the same private + /// lock, matching the shared-lock pattern used cross-SDK (see internal-spec.md §3.5 + /// and SE-017 in side-effects.json). + /// + /// + /// The new state is held here (not serialized into the trace tree as a + /// trace datum) so that the JSON shape produced by + /// is unchanged. + /// + /// +#if INTERNAL + public +#else + internal +#endif + sealed class HedgingDetectionState + { + /// + /// Well-known key used to stash a pending dispatch-reason override on + /// (or + /// 's + /// properties bag) so an upstream site (e.g. ClientRetryPolicy or the hedging + /// orchestrator) can signal the downstream dispatch site why the next attempt + /// is happening. The value must be a . + /// + internal const string DispatchReasonPropertyKey = "__CosmosDB_HedgingDetection_NextDispatchReason"; + + private readonly object regionLock = new object(); + private List requestedRegions; + private List respondedRegions; + + // Monotonic-true flag (false → true exactly once, never reset). Read lock-free + // by the public HedgingStarted getter; written only under regionLock. + // + // The volatile keyword is REQUIRED FOR THE READER SIDE: it gives the lock-free + // getter an acquire fence so the flip cannot be observed before the matching + // requestedRegions.Add that established the Hedging entry. On the writer side + // volatile's release fence is redundant — the lock release already publishes the + // store to other threads — but the write MUST stay inside regionLock so the flag + // flip remains atomic with the list Add. If a future contributor moves the write + // outside the lock as an "optimization", snapshot readers could observe + // HedgingStarted == true before the corresponding RequestedRegion is visible in + // GetRequestedRegionsSnapshot(), breaking the diagnostics invariant that + // HedgingStarted implies at least one Hedging entry exists. + // + // See F7 review feedback on PR #5868: the previous implementation took the lock + // on every read, adding an avoidable Monitor.Enter on the public + // CosmosDiagnostics.HedgingStarted() hot path. + private volatile bool hedgingStarted; + + /// + /// Appends a dispatched-region entry. No-op if is + /// null or empty. When is + /// the HedgingStarted flag is + /// also flipped to true. + /// + internal void AppendRequested(string regionName, RequestedRegionReason reason) + { + if (string.IsNullOrEmpty(regionName)) + { + return; + } + + lock (this.regionLock) + { + if (this.requestedRegions == null) + { + this.requestedRegions = new List(capacity: 4); + } + + this.requestedRegions.Add(new RequestedRegion(regionName, reason)); + + if (reason == RequestedRegionReason.Hedging) + { + // Intentionally written inside regionLock so the flag flip is atomic + // with the Add above. See the field-level comment on hedgingStarted. + this.hedgingStarted = true; + } + } + } + + /// + /// Appends a responded-region entry. No-op if is + /// null or empty. Duplicates are allowed by design (see internal-spec §3.1 + /// "Duplicates ARE allowed and ARE expected"). + /// + internal void AppendResponded(string regionName) + { + if (string.IsNullOrEmpty(regionName)) + { + return; + } + + lock (this.regionLock) + { + if (this.respondedRegions == null) + { + this.respondedRegions = new List(capacity: 4); + } + + this.respondedRegions.Add(regionName); + } + } + + /// + /// Returns true if at least one + /// entry has been appended for this operation. Read lock-free; safe to call on + /// the diagnostics hot path. See F7 review feedback on PR #5868. + /// + internal bool HedgingStarted => this.hedgingStarted; + + /// + /// Returns a snapshot of the dispatched-region list. Snapshot is taken under the + /// lock; the returned array is independent of subsequent mutations. + /// + internal IReadOnlyList GetRequestedRegionsSnapshot() + { + lock (this.regionLock) + { + if (this.requestedRegions == null || this.requestedRegions.Count == 0) + { + return Array.Empty(); + } + + return this.requestedRegions.ToArray(); + } + } + + /// + /// Returns a snapshot of the responded-region list in arrival order. Duplicates + /// preserved. Snapshot is taken under the lock; the returned array is independent + /// of subsequent mutations. + /// + internal IReadOnlyList GetRespondedRegionsSnapshot() + { + lock (this.regionLock) + { + if (this.respondedRegions == null || this.respondedRegions.Count == 0) + { + return Array.Empty(); + } + + return this.respondedRegions.ToArray(); + } + } + } +} diff --git a/Microsoft.Azure.Cosmos/src/Tracing/TraceData/ClientSideRequestStatisticsTraceDatum.cs b/Microsoft.Azure.Cosmos/src/Tracing/TraceData/ClientSideRequestStatisticsTraceDatum.cs index 9bd89c977a..c5bf045b74 100644 --- a/Microsoft.Azure.Cosmos/src/Tracing/TraceData/ClientSideRequestStatisticsTraceDatum.cs +++ b/Microsoft.Azure.Cosmos/src/Tracing/TraceData/ClientSideRequestStatisticsTraceDatum.cs @@ -269,6 +269,10 @@ public void RecordResponse( this.TraceSummary?.AddRegionContacted(regionName, locationEndpoint); } + // Hedging-Detection API: record the responding region (duplicates allowed and + // expected — see internal-spec §3.1 and AC14). + this.TraceSummary?.HedgingDetectionState?.AppendResponded(regionName); + if (responseStatistics.StoreResult != null && !((HttpStatusCode)responseStatistics.StoreResult.StatusCode).IsSuccess() && !(responseStatistics.StoreResult.StatusCode == StatusCodes.NotFound && responseStatistics.StoreResult.SubStatusCode == SubStatusCodes.Unknown) && !(responseStatistics.StoreResult.StatusCode == StatusCodes.Conflict && responseStatistics.StoreResult.SubStatusCode == SubStatusCodes.Unknown) @@ -357,6 +361,7 @@ public void RecordHttpResponse(HttpRequestMessage request, request.Properties.TryGetValue(HttpRequestRegionNameProperty, out regionName)) { this.TraceSummary.AddRegionContacted(Convert.ToString(regionName), locationEndpoint); + this.TraceSummary.HedgingDetectionState?.AppendResponded(Convert.ToString(regionName)); } this.shallowCopyOfHttpResponseStatistics = null; @@ -388,6 +393,7 @@ public void RecordHttpException(HttpRequestMessage request, request.Properties.TryGetValue(HttpRequestRegionNameProperty, out regionName)) { this.TraceSummary.AddRegionContacted(Convert.ToString(regionName), locationEndpoint); + this.TraceSummary.HedgingDetectionState?.AppendResponded(Convert.ToString(regionName)); } this.shallowCopyOfHttpResponseStatistics = null; diff --git a/Microsoft.Azure.Cosmos/src/Tracing/TraceSummary.cs b/Microsoft.Azure.Cosmos/src/Tracing/TraceSummary.cs index 1923dc8524..04943ef6b7 100644 --- a/Microsoft.Azure.Cosmos/src/Tracing/TraceSummary.cs +++ b/Microsoft.Azure.Cosmos/src/Tracing/TraceSummary.cs @@ -95,5 +95,16 @@ public void AddRegionContacted(string regionName, Uri locationEndpoint) } } + /// + /// Per-operation state backing the Hedging Detection API surface (HedgingStarted / + /// GetRequestedRegions / GetRespondedRegions on ). + /// + /// + /// Lives on so that the entire trace tree for a single + /// operation shares one state instance. Populated at orchestrator dispatch sites + /// and response-handling sites; never serialized into the trace tree. + /// + public HedgingDetectionState HedgingDetectionState { get; } = new HedgingDetectionState(); + } } \ No newline at end of file diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index 5c2b155120..9f78b227f6 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -1,4 +1,4 @@ -namespace Microsoft.Azure.Cosmos.Client.Tests +namespace Microsoft.Azure.Cosmos.Client.Tests { using System; using Microsoft.Azure.Cosmos.Routing; @@ -19,7 +19,9 @@ using Microsoft.Azure.Cosmos.Common; using System.Net.Http; using System.Reflection; - using System.Collections.Concurrent; + using System.Collections.Concurrent; + using Microsoft.Azure.Cosmos.Handlers; + using Microsoft.Azure.Cosmos.Tracing; /// /// Tests for @@ -1186,6 +1188,361 @@ public async Task DtxRequest_WithBody_DeferredToOuterLoop(int statusCode, int su "CRP's inner retry budget must NOT have been consumed by the deferred body-bearing calls; an empty-body response should still trigger an inner retry."); } + /// + /// Hedging-Detection invariant: when a hedge arm has already been tagged with + /// by + /// CrossRegionHedgingAvailabilityStrategy.CloneAndSendAsync, a subsequent retry + /// on the same cloned request (e.g. 410 Gone, 449) must NOT silently overwrite the + /// dispatch reason with or + /// . Doing so would erase the hedge + /// origin from the GetRequestedRegions() sequence on the second dispatch of + /// the same arm. Pins F3 review feedback on PR #5868. + /// + [TestMethod] + public async Task OnBeforeSendRequest_HedgeArmRetry_PreservesHedgingReason() + { + // Arrange — standard 2-region client (matches the Initialize() defaults). + const bool enableEndpointDiscovery = true; + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery, + isThinClientEnabled: false); + + // A hedge-arm dispatch carries a pre-seeded Hedging tag in Properties (this + // mirrors what CrossRegionHedgingAvailabilityStrategy.CloneAndSendAsync does + // for requestNumber > 0). + DocumentServiceRequest request = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + request.Properties = new Dictionary + { + { + Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState.DispatchReasonPropertyKey, + RequestedRegionReason.Hedging + } + }; + + // First attempt — retryContext is null, so OnBeforeSendRequest must not touch + // the existing Hedging value. + retryPolicy.OnBeforeSendRequest(request); + Assert.IsTrue( + request.Properties.TryGetValue( + Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState.DispatchReasonPropertyKey, + out object firstAttemptReasonObj), + "Hedging tag must survive the first OnBeforeSendRequest call (retryContext == null)."); + Assert.AreEqual(RequestedRegionReason.Hedging, firstAttemptReasonObj); + + // Simulate a 410 Gone on the hedge arm → retryContext gets populated. + // 410/LeaseNotFound flows through ShouldRetryOnUnavailableEndpointStatusCodes + // and yields RetryRequestOnPreferredLocations = true (RegionFailover-shaped). + DocumentClientException leaseNotFound = new DocumentClientException( + message: "LeaseNotFound on hedge arm", + innerException: null, + statusCode: HttpStatusCode.Gone, + substatusCode: SubStatusCodes.LeaseNotFound, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + + ShouldRetryResult retryDecision = await retryPolicy.ShouldRetryAsync(leaseNotFound, CancellationToken.None); + Assert.IsTrue(retryDecision.ShouldRetry, "410/LeaseNotFound must trigger a CRP retry decision."); + + // Critical assertion — second OnBeforeSendRequest (retryContext != null) must + // PRESERVE the existing Hedging tag rather than overwriting it. Without the + // preservation guard this would flip to RegionFailover / OperationRetry, and + // the dispatch site would record the retry of the hedge arm as a same-region + // retry, losing the hedge origin entirely. + retryPolicy.OnBeforeSendRequest(request); + Assert.IsTrue( + request.Properties.TryGetValue( + Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState.DispatchReasonPropertyKey, + out object retryReasonObj), + "Hedging tag must still be present on Properties after the hedge-arm retry."); + Assert.AreEqual( + RequestedRegionReason.Hedging, + retryReasonObj, + "Hedge-arm retry must NOT overwrite Hedging with RegionFailover / OperationRetry — see F3 on PR #5868."); + } + + /// + /// Companion to : + /// when the existing tag is NOT (e.g. it + /// is the OperationRetry value written by a previous retry), the policy MUST overwrite + /// it with the new retry reason. The preservation guard is strictly scoped to + /// . + /// + [TestMethod] + public async Task OnBeforeSendRequest_NonHedgeRetry_OverwritesPreviousReason() + { + const bool enableEndpointDiscovery = true; + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery, + isThinClientEnabled: false); + + DocumentServiceRequest request = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + request.Properties = new Dictionary + { + { + Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState.DispatchReasonPropertyKey, + RequestedRegionReason.OperationRetry + } + }; + + // First attempt (retryContext == null): nothing overwritten. + retryPolicy.OnBeforeSendRequest(request); + Assert.AreEqual( + RequestedRegionReason.OperationRetry, + request.Properties[Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState.DispatchReasonPropertyKey]); + + // Drive a 410/LeaseNotFound → ShouldRetryOnUnavailableEndpointStatusCodes + // sets retryContext.RetryRequestOnPreferredLocations = true. + DocumentClientException leaseNotFound = new DocumentClientException( + message: "LeaseNotFound", + innerException: null, + statusCode: HttpStatusCode.Gone, + substatusCode: SubStatusCodes.LeaseNotFound, + requestUri: request.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + + ShouldRetryResult retryDecision = await retryPolicy.ShouldRetryAsync(leaseNotFound, CancellationToken.None); + Assert.IsTrue(retryDecision.ShouldRetry); + + // Now the policy SHOULD overwrite OperationRetry → RegionFailover. + retryPolicy.OnBeforeSendRequest(request); + Assert.AreEqual( + RequestedRegionReason.RegionFailover, + request.Properties[Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState.DispatchReasonPropertyKey], + "Non-Hedging tag must be overwritten by the policy's new retry reason."); + } + + /// + /// Pins the F3 fix on : when + /// the resolved dispatch reason is , + /// the dispatch site MUST NOT remove the property from + /// DocumentServiceRequest.Properties. Subsequent physical retries of the + /// same hedge arm (driven by via + /// OnBeforeSendRequest on the same cloned RequestMessage) need to + /// observe the existing Hedging tag so the preservation guard in + /// can keep the reason as + /// Hedging rather than overwriting it with OperationRetry / RegionFailover. + /// + /// Pins F3 review feedback on PR #5868: without this carve-out the F3 preservation + /// guard is dead code in production because TransportHandler drains the shared + /// Properties dictionary before the retry-driven re-entry of OnBeforeSendRequest + /// can observe it. + /// + [TestMethod] + public void AppendDispatchedRegion_HedgingReason_LeavesPropertyForRetry() + { + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: true, + isPreferredLocationsListEmpty: false); + + using ITrace rootTrace = Trace.GetRootTrace("AppendDispatchedRegion_HedgingReason"); + using RequestMessage requestMessage = new RequestMessage( + HttpMethod.Get, + "/dbs/db/colls/coll/docs/id", + rootTrace) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + }; + + DocumentServiceRequest serviceRequest = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + serviceRequest.RequestContext.RouteToLocation(ClientRetryPolicyTests.Location1Endpoint); + serviceRequest.Properties = new Dictionary + { + { + HedgingDetectionState.DispatchReasonPropertyKey, + RequestedRegionReason.Hedging + } + }; + + TransportHandler.AppendDispatchedRegion(requestMessage, serviceRequest, endpointManager); + + // The Hedging entry must have been recorded ... + IReadOnlyList regions = rootTrace.Summary.HedgingDetectionState.GetRequestedRegionsSnapshot(); + Assert.AreEqual(1, regions.Count, "AppendRequested should have recorded exactly one region."); + Assert.AreEqual(RequestedRegionReason.Hedging, regions[0].Reason); + + // ... AND the property must remain so subsequent retries can observe it + // (this is the F3 carve-out). + Assert.IsTrue( + serviceRequest.Properties.ContainsKey(HedgingDetectionState.DispatchReasonPropertyKey), + "Hedging property must remain on Properties after AppendDispatchedRegion so the F3 preservation guard in ClientRetryPolicy can keep it on the next physical retry of this hedge arm."); + Assert.AreEqual( + RequestedRegionReason.Hedging, + serviceRequest.Properties[HedgingDetectionState.DispatchReasonPropertyKey], + "Hedging property value must be unchanged."); + } + + /// + /// Inverse of : + /// when the resolved reason is NOT Hedging (e.g. OperationRetry written by + /// on a same-region retry), the dispatch site MUST + /// consume the property so a subsequent dispatch without a fresh upstream tag + /// defaults to rather than re-using + /// the stale OperationRetry / RegionFailover value. Pins the original F5 behavior + /// for non-Hedging reasons. + /// + [TestMethod] + public void AppendDispatchedRegion_NonHedgingReason_RemovesPropertyAfterConsume() + { + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: true, + isPreferredLocationsListEmpty: false); + + using ITrace rootTrace = Trace.GetRootTrace("AppendDispatchedRegion_NonHedgingReason"); + using RequestMessage requestMessage = new RequestMessage( + HttpMethod.Get, + "/dbs/db/colls/coll/docs/id", + rootTrace) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + }; + + DocumentServiceRequest serviceRequest = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + serviceRequest.RequestContext.RouteToLocation(ClientRetryPolicyTests.Location1Endpoint); + serviceRequest.Properties = new Dictionary + { + { + HedgingDetectionState.DispatchReasonPropertyKey, + RequestedRegionReason.OperationRetry + } + }; + + TransportHandler.AppendDispatchedRegion(requestMessage, serviceRequest, endpointManager); + + IReadOnlyList regions = rootTrace.Summary.HedgingDetectionState.GetRequestedRegionsSnapshot(); + Assert.AreEqual(1, regions.Count); + Assert.AreEqual(RequestedRegionReason.OperationRetry, regions[0].Reason); + + Assert.IsFalse( + serviceRequest.Properties.ContainsKey(HedgingDetectionState.DispatchReasonPropertyKey), + "Non-Hedging property must be removed after AppendDispatchedRegion so subsequent dispatches default to Initial unless explicitly re-tagged."); + } + + /// + /// End-to-end production-order test for F3 + F5: drives the exact sequence the + /// hedge-arm retry path takes inside : + /// + /// Strategy seeds Properties[KEY] = Hedging. + /// OnBeforeSendRequest (first attempt, retryContext == null) leaves it alone. + /// AppendDispatchedRegion consumes it. With the F3 fix this LEAVES the + /// property on Properties because the reason was Hedging. + /// 410 Gone -> ShouldRetryAsync populates retryContext. + /// OnBeforeSendRequest (retry, retryContext != null) MUST see Hedging + /// on Properties and preserve it via the F3 guard. + /// AppendDispatchedRegion on the retry records the second physical + /// dispatch as Hedging, not as RegionFailover. + /// + /// Without the F3 fix in , + /// the property would be drained at step 3 and the retry would be silently + /// recorded as RegionFailover, defeating the F3 guard in + /// . + /// + [TestMethod] + public async Task HedgeArmRetry_ProductionOrder_RecordsBothPhysicalAttemptsAsHedging() + { + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: true, + isPreferredLocationsListEmpty: false); + + ClientRetryPolicy retryPolicy = new ClientRetryPolicy( + endpointManager, + this.partitionKeyRangeLocationCache, + new RetryOptions(), + enableEndpointDiscovery: true, + isThinClientEnabled: false); + + using ITrace rootTrace = Trace.GetRootTrace("HedgeArmRetry_ProductionOrder"); + using RequestMessage requestMessage = new RequestMessage( + HttpMethod.Get, + "/dbs/db/colls/coll/docs/id", + rootTrace) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + }; + + // Step 1: hedge orchestrator pre-seeds Properties[KEY] = Hedging on the + // cloned RequestMessage before it enters the pipeline. + requestMessage.Properties[HedgingDetectionState.DispatchReasonPropertyKey] = + RequestedRegionReason.Hedging; + + // Hand-build a DSR whose Properties is the SAME reference as + // requestMessage.Properties — this mirrors RequestMessage.ToDocumentServiceRequest() + // line 302 (`serviceRequest.Properties = this.Properties`) and is essential + // for reproducing the bug: TransportHandler.Remove on serviceRequest.Properties + // also mutates requestMessage.Properties. + DocumentServiceRequest serviceRequest = this.CreateRequest(isReadRequest: true, isMasterResourceType: false); + serviceRequest.Properties = requestMessage.Properties; + serviceRequest.RequestContext.RouteToLocation(ClientRetryPolicyTests.Location1Endpoint); + + // Step 2: first OnBeforeSendRequest (retryContext == null). The Hedging tag + // must survive. + retryPolicy.OnBeforeSendRequest(serviceRequest); + Assert.AreEqual( + RequestedRegionReason.Hedging, + serviceRequest.Properties[HedgingDetectionState.DispatchReasonPropertyKey], + "First OnBeforeSendRequest (retryContext == null) must not touch the Hedging tag."); + + // Step 3: AppendDispatchedRegion consumes the property (records the first + // physical hedge dispatch). With the F3 fix the property remains. + TransportHandler.AppendDispatchedRegion(requestMessage, serviceRequest, endpointManager); + Assert.IsTrue( + serviceRequest.Properties.ContainsKey(HedgingDetectionState.DispatchReasonPropertyKey), + "F3 fix: Hedging property must survive TransportHandler.AppendDispatchedRegion so the next retry's OnBeforeSendRequest can preserve it."); + + // Step 4: 410 Gone -> ShouldRetryAsync populates retryContext. + DocumentClientException leaseNotFound = new DocumentClientException( + message: "LeaseNotFound on hedge arm", + innerException: null, + statusCode: HttpStatusCode.Gone, + substatusCode: SubStatusCodes.LeaseNotFound, + requestUri: serviceRequest.RequestContext.LocationEndpointToRoute, + responseHeaders: new DictionaryNameValueCollection()); + ShouldRetryResult retryDecision = await retryPolicy.ShouldRetryAsync(leaseNotFound, CancellationToken.None); + Assert.IsTrue(retryDecision.ShouldRetry, "410/LeaseNotFound must trigger a CRP retry decision on a hedge arm."); + + // Step 5: second OnBeforeSendRequest (retryContext != null). The F3 guard + // MUST observe Hedging on Properties and preserve it. + retryPolicy.OnBeforeSendRequest(serviceRequest); + Assert.AreEqual( + RequestedRegionReason.Hedging, + serviceRequest.Properties[HedgingDetectionState.DispatchReasonPropertyKey], + "F3 guard must preserve Hedging on the retry-side OnBeforeSendRequest — otherwise the hedge origin is silently overwritten with RegionFailover."); + + // Step 6: AppendDispatchedRegion on the retry records the second physical + // attempt — also as Hedging. + TransportHandler.AppendDispatchedRegion(requestMessage, serviceRequest, endpointManager); + + IReadOnlyList regions = rootTrace.Summary.HedgingDetectionState.GetRequestedRegionsSnapshot(); + Assert.AreEqual(2, regions.Count, "Both physical dispatches of the hedge arm should have been recorded."); + Assert.IsTrue( + regions.All(r => r.Reason == RequestedRegionReason.Hedging), + $"Both physical retries of the hedge arm must be recorded as Hedging, not RegionFailover. Actual reasons: [{string.Join(", ", regions.Select(r => r.Reason))}]."); + Assert.IsTrue( + rootTrace.Summary.HedgingDetectionState.HedgingStarted, + "HedgingStarted must be true after a Hedging entry has been appended."); + } + private static DocumentServiceRequest CreateDtxRequest() { return DocumentServiceRequest.Create( diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Contracts/DotNetSDKAPI.net6.json b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Contracts/DotNetSDKAPI.net6.json index ef91e75c2d..fddac1ce7a 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Contracts/DotNetSDKAPI.net6.json +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Contracts/DotNetSDKAPI.net6.json @@ -3760,6 +3760,11 @@ "Microsoft.Azure.Cosmos.CosmosDiagnostics;System.Object;IsAbstract:True;IsSealed:False;IsInterface:False;IsEnum:False;IsClass:True;IsValueType:False;IsNested:False;IsGenericType:False;IsSerializable:False": { "Subclasses": {}, "Members": { + "Boolean HedgingStarted()": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "Boolean HedgingStarted();IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, "Int32 GetFailedRequestCount()": { "Type": "Method", "Attributes": [], @@ -3770,6 +3775,16 @@ "Attributes": [], "MethodInfo": "Microsoft.Azure.Cosmos.ServerSideCumulativeMetrics GetQueryMetrics();IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" }, + "System.Collections.Generic.IReadOnlyList`1[Microsoft.Azure.Cosmos.RequestedRegion] GetRequestedRegions()": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "System.Collections.Generic.IReadOnlyList`1[Microsoft.Azure.Cosmos.RequestedRegion] GetRequestedRegions();IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "System.Collections.Generic.IReadOnlyList`1[System.String] GetRespondedRegions()": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "System.Collections.Generic.IReadOnlyList`1[System.String] GetRespondedRegions();IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, "System.Collections.Generic.IReadOnlyList`1[System.ValueTuple`2[System.String,System.Uri]] GetContactedRegions()": { "Type": "Method", "Attributes": [], @@ -8730,6 +8745,117 @@ }, "NestedTypes": {} }, + "Microsoft.Azure.Cosmos.RequestedRegion;System.ValueType;IsAbstract:False;IsSealed:True;IsInterface:False;IsEnum:False;IsClass:False;IsValueType:True;IsNested:False;IsGenericType:False;IsSerializable:False": { + "Subclasses": {}, + "Members": { + "Boolean Equals(Microsoft.Azure.Cosmos.RequestedRegion)": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "Boolean Equals(Microsoft.Azure.Cosmos.RequestedRegion);IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:True;" + }, + "Boolean Equals(System.Object)": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "Boolean Equals(System.Object);IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "Int32 GetHashCode()": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "Int32 GetHashCode();IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason get_Reason()[System.Runtime.CompilerServices.CompilerGeneratedAttribute()]": { + "Type": "Method", + "Attributes": [ + "CompilerGeneratedAttribute" + ], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason get_Reason();IsAbstract:False;IsStatic:False;IsVirtual:False;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason Reason": { + "Type": "Property", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason Reason;CanRead:True;CanWrite:False;Microsoft.Azure.Cosmos.RequestedRegionReason get_Reason();IsAbstract:False;IsStatic:False;IsVirtual:False;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "System.Boolean Microsoft.Azure.Cosmos.RequestedRegion.op_Equality(Microsoft.Azure.Cosmos.RequestedRegion, Microsoft.Azure.Cosmos.RequestedRegion)": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "System.Boolean Microsoft.Azure.Cosmos.RequestedRegion.op_Equality(Microsoft.Azure.Cosmos.RequestedRegion, Microsoft.Azure.Cosmos.RequestedRegion);IsAbstract:False;IsStatic:True;IsVirtual:False;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "System.Boolean Microsoft.Azure.Cosmos.RequestedRegion.op_Inequality(Microsoft.Azure.Cosmos.RequestedRegion, Microsoft.Azure.Cosmos.RequestedRegion)": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "System.Boolean Microsoft.Azure.Cosmos.RequestedRegion.op_Inequality(Microsoft.Azure.Cosmos.RequestedRegion, Microsoft.Azure.Cosmos.RequestedRegion);IsAbstract:False;IsStatic:True;IsVirtual:False;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "System.String get_RegionName()[System.Runtime.CompilerServices.CompilerGeneratedAttribute()]": { + "Type": "Method", + "Attributes": [ + "CompilerGeneratedAttribute" + ], + "MethodInfo": "System.String get_RegionName();IsAbstract:False;IsStatic:False;IsVirtual:False;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "System.String RegionName": { + "Type": "Property", + "Attributes": [], + "MethodInfo": "System.String RegionName;CanRead:True;CanWrite:False;System.String get_RegionName();IsAbstract:False;IsStatic:False;IsVirtual:False;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "System.String ToString()": { + "Type": "Method", + "Attributes": [], + "MethodInfo": "System.String ToString();IsAbstract:False;IsStatic:False;IsVirtual:True;IsGenericMethod:False;IsConstructor:False;IsFinal:False;" + }, + "Void .ctor(System.String, Microsoft.Azure.Cosmos.RequestedRegionReason)": { + "Type": "Constructor", + "Attributes": [], + "MethodInfo": "Void .ctor(System.String, Microsoft.Azure.Cosmos.RequestedRegionReason)" + } + }, + "NestedTypes": {} + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason;System.Enum;IsAbstract:False;IsSealed:True;IsInterface:False;IsEnum:True;IsClass:False;IsValueType:True;IsNested:False;IsGenericType:False;IsSerializable:True": { + "Subclasses": {}, + "Members": { + "Byte value__": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Byte value__;IsInitOnly:False;IsStatic:False;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason CircuitBreakerProbe": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason CircuitBreakerProbe;IsInitOnly:False;IsStatic:True;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason Hedging": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason Hedging;IsInitOnly:False;IsStatic:True;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason Initial": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason Initial;IsInitOnly:False;IsStatic:True;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason OperationRetry": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason OperationRetry;IsInitOnly:False;IsStatic:True;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason RegionFailover": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason RegionFailover;IsInitOnly:False;IsStatic:True;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason TransportRetry": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason TransportRetry;IsInitOnly:False;IsStatic:True;" + }, + "Microsoft.Azure.Cosmos.RequestedRegionReason Unknown": { + "Type": "Field", + "Attributes": [], + "MethodInfo": "Microsoft.Azure.Cosmos.RequestedRegionReason Unknown;IsInitOnly:False;IsStatic:True;" + } + }, + "NestedTypes": {} + }, "Microsoft.Azure.Cosmos.RequestHandler;System.Object;IsAbstract:True;IsSealed:False;IsInterface:False;IsEnum:False;IsClass:True;IsValueType:False;IsNested:False;IsGenericType:False;IsSerializable:False": { "Subclasses": {}, "Members": { diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/CosmosDiagnosticsBackwardCompatTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/CosmosDiagnosticsBackwardCompatTests.cs new file mode 100644 index 0000000000..5b71eb9f97 --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/CosmosDiagnosticsBackwardCompatTests.cs @@ -0,0 +1,54 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ +namespace Microsoft.Azure.Cosmos.Diagnostics +{ + using System; + using System.Collections.Generic; + using System.IO; + using Microsoft.VisualStudio.TestTools.UnitTesting; + + /// + /// Backward-compatibility tests for the Hedging Detection API additions to + /// . Covers AC9: legacy subclasses that predate + /// the new virtual methods must continue to work without throwing. + /// + [TestClass] + public class CosmosDiagnosticsBackwardCompatTests + { + private sealed class LegacyCosmosDiagnostics : CosmosDiagnostics + { + // Intentionally does NOT override any of the new virtual methods, to + // simulate a customer subclass written before they existed. + public override string ToString() => "legacy"; + public override TimeSpan GetClientElapsedTime() => TimeSpan.Zero; + public override IReadOnlyList<(string regionName, Uri uri)> GetContactedRegions() => + Array.Empty<(string, Uri)>(); + } + + [TestMethod] + public void HedgingStarted_DefaultsToFalse_OnLegacySubclass() + { + CosmosDiagnostics d = new LegacyCosmosDiagnostics(); + Assert.IsFalse(d.HedgingStarted()); + } + + [TestMethod] + public void GetRequestedRegions_DefaultsToEmpty_OnLegacySubclass() + { + CosmosDiagnostics d = new LegacyCosmosDiagnostics(); + IReadOnlyList regions = d.GetRequestedRegions(); + Assert.IsNotNull(regions); + Assert.AreEqual(0, regions.Count); + } + + [TestMethod] + public void GetRespondedRegions_DefaultsToEmpty_OnLegacySubclass() + { + CosmosDiagnostics d = new LegacyCosmosDiagnostics(); + IReadOnlyList regions = d.GetRespondedRegions(); + Assert.IsNotNull(regions); + Assert.AreEqual(0, regions.Count); + } + } +} diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/HedgingDetectionStateTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/HedgingDetectionStateTests.cs new file mode 100644 index 0000000000..fb3a3754dc --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/HedgingDetectionStateTests.cs @@ -0,0 +1,247 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ +namespace Microsoft.Azure.Cosmos.Diagnostics +{ + using System.Collections.Generic; + using System.Reflection; + using System.Threading; + using System.Threading.Tasks; + using Microsoft.Azure.Cosmos.Tracing; + using Microsoft.VisualStudio.TestTools.UnitTesting; + + [TestClass] + public class HedgingDetectionStateTests + { + [TestMethod] + public void Defaults_AreEmptyAndHedgingFalse() + { + HedgingDetectionState state = new HedgingDetectionState(); + + Assert.IsFalse(state.HedgingStarted); + Assert.AreEqual(0, state.GetRequestedRegionsSnapshot().Count); + Assert.AreEqual(0, state.GetRespondedRegionsSnapshot().Count); + } + + [TestMethod] + public void AppendRequested_NullOrEmptyName_IsIgnored() + { + HedgingDetectionState state = new HedgingDetectionState(); + state.AppendRequested(null, RequestedRegionReason.Initial); + state.AppendRequested(string.Empty, RequestedRegionReason.Hedging); + + Assert.AreEqual(0, state.GetRequestedRegionsSnapshot().Count); + Assert.IsFalse(state.HedgingStarted); + } + + [TestMethod] + public void AppendRequested_Hedging_FlipsHedgingStarted() + { + HedgingDetectionState state = new HedgingDetectionState(); + state.AppendRequested("East US", RequestedRegionReason.Initial); + Assert.IsFalse(state.HedgingStarted); + + state.AppendRequested("West US", RequestedRegionReason.Hedging); + Assert.IsTrue(state.HedgingStarted); + } + + [TestMethod] + public void AppendRequested_NonHedgingReasons_DoNotFlipHedgingStarted() + { + HedgingDetectionState state = new HedgingDetectionState(); + state.AppendRequested("r1", RequestedRegionReason.Initial); + state.AppendRequested("r2", RequestedRegionReason.OperationRetry); + state.AppendRequested("r3", RequestedRegionReason.RegionFailover); + state.AppendRequested("r4", RequestedRegionReason.TransportRetry); + state.AppendRequested("r5", RequestedRegionReason.CircuitBreakerProbe); + + Assert.IsFalse(state.HedgingStarted); + Assert.AreEqual(5, state.GetRequestedRegionsSnapshot().Count); + } + + [TestMethod] + public void AppendRequested_PreservesOrderAndReasonsAndDuplicates() + { + HedgingDetectionState state = new HedgingDetectionState(); + state.AppendRequested("East US", RequestedRegionReason.Initial); + state.AppendRequested("West US", RequestedRegionReason.Hedging); + state.AppendRequested("East US", RequestedRegionReason.OperationRetry); + + IReadOnlyList snap = state.GetRequestedRegionsSnapshot(); + Assert.AreEqual(3, snap.Count); + Assert.AreEqual(new RequestedRegion("East US", RequestedRegionReason.Initial), snap[0]); + Assert.AreEqual(new RequestedRegion("West US", RequestedRegionReason.Hedging), snap[1]); + Assert.AreEqual(new RequestedRegion("East US", RequestedRegionReason.OperationRetry), snap[2]); + } + + [TestMethod] + public void AppendResponded_NullOrEmpty_Ignored_DuplicatesAllowed() + { + HedgingDetectionState state = new HedgingDetectionState(); + state.AppendResponded(null); + state.AppendResponded(string.Empty); + Assert.AreEqual(0, state.GetRespondedRegionsSnapshot().Count); + + state.AppendResponded("East US"); + state.AppendResponded("West US"); + state.AppendResponded("East US"); + + IReadOnlyList snap = state.GetRespondedRegionsSnapshot(); + Assert.AreEqual(3, snap.Count); + CollectionAssert.AreEqual(new[] { "East US", "West US", "East US" }, (System.Collections.ICollection)snap); + } + + [TestMethod] + public void Snapshots_AreIndependentOfSubsequentMutations() + { + HedgingDetectionState state = new HedgingDetectionState(); + state.AppendRequested("East US", RequestedRegionReason.Initial); + state.AppendResponded("East US"); + + IReadOnlyList reqSnap = state.GetRequestedRegionsSnapshot(); + IReadOnlyList respSnap = state.GetRespondedRegionsSnapshot(); + + state.AppendRequested("West US", RequestedRegionReason.Hedging); + state.AppendResponded("West US"); + + Assert.AreEqual(1, reqSnap.Count); + Assert.AreEqual(1, respSnap.Count); + } + + [TestMethod] + public async Task ConcurrentAppends_AreThreadSafe() + { + HedgingDetectionState state = new HedgingDetectionState(); + const int writerCount = 8; + const int perWriter = 500; + + Task[] writers = new Task[writerCount]; + for (int w = 0; w < writerCount; w++) + { + int writerIndex = w; + writers[w] = Task.Run(() => + { + for (int i = 0; i < perWriter; i++) + { + RequestedRegionReason reason = ((writerIndex + i) % 2 == 0) + ? RequestedRegionReason.Initial + : RequestedRegionReason.Hedging; + state.AppendRequested("R" + (i % 3), reason); + state.AppendResponded("R" + (i % 3)); + } + }); + } + + await Task.WhenAll(writers); + + Assert.AreEqual(writerCount * perWriter, state.GetRequestedRegionsSnapshot().Count); + Assert.AreEqual(writerCount * perWriter, state.GetRespondedRegionsSnapshot().Count); + Assert.IsTrue(state.HedgingStarted); + } + + /// + /// Pins the F7 fix on PR #5868: the hedgingStarted backing field MUST be + /// declared volatile so the public HedgingStarted getter can read it + /// without acquiring regionLock. Without volatile, the lock-free read + /// would not see the writer-side flip in a memory-model-correct way and the + /// reviewer-flagged optimization would silently re-introduce a race. + /// + [TestMethod] + public void HedgingStartedBackingField_IsDeclaredVolatile() + { + FieldInfo field = typeof(HedgingDetectionState).GetField( + name: "hedgingStarted", + bindingAttr: BindingFlags.NonPublic | BindingFlags.Instance); + + Assert.IsNotNull(field, "Backing field 'hedgingStarted' must exist on HedgingDetectionState."); + + // The C# 'volatile' modifier surfaces as the IsVolatile required custom modifier + // on the field's type signature. Asserting on this catches an accidental drop of + // the modifier (e.g. during a refactor) without depending on source-level scrape. + System.Type[] requiredMods = field.GetRequiredCustomModifiers(); + bool hasVolatileModifier = false; + foreach (System.Type mod in requiredMods) + { + if (mod == typeof(System.Runtime.CompilerServices.IsVolatile)) + { + hasVolatileModifier = true; + break; + } + } + + Assert.IsTrue( + hasVolatileModifier, + "F7 invariant: 'hedgingStarted' must be 'volatile bool' so the lock-free HedgingStarted getter is memory-model correct."); + } + + /// + /// Functional companion to : + /// readers spinning on HedgingStarted must observe true after a single + /// writer flips it, without ever acquiring regionLock. This catches a regression + /// where someone re-introduces locking on the read path or drops volatile from + /// the backing field — the readers would either deadlock against the writer or never + /// observe the flip. + /// + [TestMethod] + public async Task HedgingStarted_LockFreeRead_ObservesWriterFlip() + { + HedgingDetectionState state = new HedgingDetectionState(); + using CancellationTokenSource cts = new CancellationTokenSource(); + CancellationToken ct = cts.Token; + + // Three readers spin on HedgingStarted concurrently with a single writer that + // performs many AppendRequested calls (each one takes regionLock). If the read + // path is genuinely lock-free, the readers will record the flip moment and exit + // promptly; if it took the lock, they would queue behind the writer and the test + // would take noticeably longer (or, on stricter builds, expose the contention via + // a deadline overrun). + const int readerCount = 3; + Task[] readers = new Task[readerCount]; + for (int r = 0; r < readerCount; r++) + { + readers[r] = Task.Run(() => + { + while (!ct.IsCancellationRequested) + { + if (state.HedgingStarted) + { + return true; + } + } + + return false; + }); + } + + // Writer appends a series of non-Hedging entries first (so readers observe the + // false state across many iterations), then a single Hedging entry to flip the + // flag. + Task writer = Task.Run(() => + { + for (int i = 0; i < 200; i++) + { + state.AppendRequested("R" + (i % 4), RequestedRegionReason.Initial); + } + + state.AppendRequested("West US", RequestedRegionReason.Hedging); + }); + + await writer; + + // Bound the readers; if any reader has not seen true within 5 seconds the + // lock-free invariant is broken or volatile was dropped. + Task allReadersDone = Task.WhenAll(readers); + Task completed = await Task.WhenAny(allReadersDone, Task.Delay(5000, ct)); + cts.Cancel(); + await allReadersDone; + + Assert.AreSame(allReadersDone, completed, "Readers must observe the Hedging flip within the 5-second deadline (F7 lock-free read)."); + foreach (Task reader in readers) + { + Assert.IsTrue(reader.Result, "Every reader must observe HedgingStarted == true after the writer flipped it."); + } + + Assert.IsTrue(state.HedgingStarted, "Post-condition: HedgingStarted remains true (monotonic)."); + } + } +} diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/RequestedRegionTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/RequestedRegionTests.cs new file mode 100644 index 0000000000..7ae9b7274b --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Diagnostics/RequestedRegionTests.cs @@ -0,0 +1,96 @@ +//------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +//------------------------------------------------------------ +namespace Microsoft.Azure.Cosmos.Diagnostics +{ + using System; + using Microsoft.VisualStudio.TestTools.UnitTesting; + + [TestClass] + public class RequestedRegionTests + { + [TestMethod] + public void Constructor_StoresPropertiesVerbatim() + { + RequestedRegion r = new RequestedRegion("East US", RequestedRegionReason.Hedging); + + Assert.AreEqual("East US", r.RegionName); + Assert.AreEqual(RequestedRegionReason.Hedging, r.Reason); + } + + [TestMethod] + public void Constructor_NullRegionName_Throws() + { + Assert.ThrowsException( + () => new RequestedRegion(null, RequestedRegionReason.Initial)); + } + + [TestMethod] + public void Equality_RegionNameIsCaseInsensitive() + { + RequestedRegion a = new RequestedRegion("East US", RequestedRegionReason.Initial); + RequestedRegion b = new RequestedRegion("east us", RequestedRegionReason.Initial); + + Assert.IsTrue(a.Equals(b)); + Assert.IsTrue(a == b); + Assert.IsFalse(a != b); + Assert.IsTrue(a.Equals((object)b)); + Assert.AreEqual(a.GetHashCode(), b.GetHashCode()); + } + + [TestMethod] + public void Equality_DifferentReason_NotEqual() + { + RequestedRegion a = new RequestedRegion("East US", RequestedRegionReason.Initial); + RequestedRegion b = new RequestedRegion("East US", RequestedRegionReason.Hedging); + + Assert.IsFalse(a.Equals(b)); + Assert.IsTrue(a != b); + Assert.IsFalse(a == b); + } + + [TestMethod] + public void Equals_NonRequestedRegionObject_ReturnsFalse() + { + RequestedRegion r = new RequestedRegion("East US", RequestedRegionReason.Initial); + Assert.IsFalse(r.Equals("East US")); + Assert.IsFalse(r.Equals(null)); + } + + [TestMethod] + public void ToString_FormatsAsRegionColonReason() + { + RequestedRegion r = new RequestedRegion("East US", RequestedRegionReason.Hedging); + Assert.AreEqual("East US:Hedging", r.ToString()); + } + + [TestMethod] + public void DefaultStruct_ReasonIsUnknownSentinel_NotInitial() + { + // Guards against the pre-GA enum layout where Initial = 0 collided with + // default(RequestedRegion). The Unknown sentinel must occupy the underlying-zero + // slot so that struct-default-constructed values (e.g. new RequestedRegion[N], + // uninitialized fields, deserialized values where Reason was absent) are + // distinguishable from a real first dispatch. + RequestedRegion defaultRegion = default; + + Assert.AreEqual(RequestedRegionReason.Unknown, defaultRegion.Reason); + Assert.AreNotEqual(RequestedRegionReason.Initial, defaultRegion.Reason); + Assert.AreEqual(0, (byte)RequestedRegionReason.Unknown); + Assert.IsNull(defaultRegion.RegionName); + } + + [TestMethod] + public void Enum_UnknownIsZeroAndInitialIsOne() + { + // Pin the underlying-byte values so any future renumbering is caught. + Assert.AreEqual(0, (byte)RequestedRegionReason.Unknown); + Assert.AreEqual(1, (byte)RequestedRegionReason.Initial); + Assert.AreEqual(2, (byte)RequestedRegionReason.OperationRetry); + Assert.AreEqual(3, (byte)RequestedRegionReason.TransportRetry); + Assert.AreEqual(4, (byte)RequestedRegionReason.Hedging); + Assert.AreEqual(5, (byte)RequestedRegionReason.RegionFailover); + Assert.AreEqual(6, (byte)RequestedRegionReason.CircuitBreakerProbe); + } + } +} diff --git a/changelog.md b/changelog.md index db34364e9c..280cfa2a38 100644 --- a/changelog.md +++ b/changelog.md @@ -19,6 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 #### Features Added +- [5815](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5815) Read Consistency Strategy: Adds hub region header for LastCommittedSingleWriteRegion strategy. +- [5848](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5848) VectorEmbeddingPolicy: Adds EmbeddingSource block to Embedding model +- [5867](https://github.com/Azure/azure-cosmos-dotnet-v3/issues/5867) Diagnostics: Adds hedging detection API (`CosmosDiagnostics.HedgingStarted`, `GetRequestedRegions`, `GetRespondedRegions`) along with the new `RequestedRegion` struct and `RequestedRegionReason` enum so callers can observe per-operation hedging behavior. `RequestedRegionReason.Unknown` (= 0) is the default sentinel for `default(RequestedRegion).Reason` and is never emitted by the SDK; `RequestedRegionReason.CircuitBreakerProbe` and `RequestedRegionReason.TransportRetry` are reserved for the future and not yet populated by this SDK; see issue #5867. - [5600](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5600) HPK: Adds id to partition key when "/id" is the last path in partition key definition. - [5838](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5838) EmbeddingGenerator: Adds ICosmosEmbeddingGenerator client-wide configuration (preview) - [5911](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5911) DistributedTransaction: Adds `DistributedReadTransaction` and `DistributedWriteTransaction` APIs (with `CosmosClient.CreateDistributedReadTransaction` / `CreateDistributedWriteTransaction`) for atomic read and write operations across partitions and containers (preview) diff --git a/openspec/changes/hedging-detection-api/design.md b/openspec/changes/hedging-detection-api/design.md new file mode 100644 index 0000000000..592b83ffc8 --- /dev/null +++ b/openspec/changes/hedging-detection-api/design.md @@ -0,0 +1,160 @@ +# Design — Hedging Detection API + +## Context + +The .NET SDK ships a `CrossRegionHedgingAvailabilityStrategy` that races a +primary request and one or more secondary "hedge arm" requests across +regions. Today, a caller cannot determine whether hedging actually started +for a given operation without serializing the entire diagnostics trace to +JSON and string-matching for a `"Hedge Context"` datum. That is fragile +(format may change), allocation-heavy (deep tree walk + JSON), and not +viable on hot paths. + +This change adds a small, focused public API surface to `CosmosDiagnostics` +that exposes: + +1. Whether the SDK started hedging (`HedgingStarted`). +2. The full list of region dispatches with reason + (`GetRequestedRegions`). +3. The list of regions that actually produced a response + (`GetRespondedRegions`). + +## Decisions + +### 1. State lives off the trace tree + +The new state is stored on a new `HedgingDetectionState` class attached to +`TraceSummary`, **not** as a `TraceDatum` in the trace tree. The +`CosmosDiagnostics.ToString()` JSON shape is therefore unchanged. This +mirrors how `TraceSummary.RegionsContacted` is already populated and read +today. + +### 2. `CosmosTraceDiagnostics` delegates to `TraceSummary.HedgingDetectionState` + +The internal/concrete `CosmosTraceDiagnostics` is constructed lazily at +response time from a finished root `ITrace`. It is not available during the +request lifecycle. State therefore lives on `TraceSummary`, which is +created at the start of the operation and shared across the whole trace +tree. `CosmosTraceDiagnostics` overrides +`HedgingStarted` / `GetRequestedRegions` / `GetRespondedRegions` and forwards +to `this.Value?.Summary?.HedgingDetectionState`. + +### 3. Cross-handler dispatch-reason signaling via `Properties` + +`ClientRetryPolicy` (which knows when a retry is fired) does not have +direct access to the routing endpoint URI used by the dispatch site, and +`CrossRegionHedgingAvailabilityStrategy` runs above `RequestInvokerHandler` +whereas the dispatch site is in `TransportHandler`. To carry the +"why is this dispatch happening" signal across handlers without changing +internal contracts, we use `RequestMessage.Properties` (already a +`Dictionary` that is deep-cloned per hedge arm by +`RequestMessage.Clone`) with a well-known internal key +`HedgingDetectionState.DispatchReasonPropertyKey`. The value is a +`RequestedRegionReason`. `TransportHandler` reads the key (defaults to +`Initial`), records the dispatch, and removes the key so subsequent +retries on the same `RequestMessage` re-default unless explicitly set. + +### 4. Region resolution at dispatch time + +`DocumentServiceRequest.RequestContext.RegionName` is not populated until +the dispatch chain (gateway / address resolver) sets it, which is after +the append site. Instead, the dispatch site resolves the region name +from the routing endpoint URI: + +```csharp +string regionName = globalEndpointManager.GetLocation( + serviceRequest.RequestContext.LocationEndpointToRoute); +``` + +If the URI cannot be resolved to a known region the append is skipped +rather than recording an "unknown" sentinel. + +### 5. Hedge primary tagging + +Within `CrossRegionHedgingAvailabilityStrategy.CloneAndSendAsync`, +`requestNumber == 0` is the primary arm and is left untagged so the +dispatch site records it as `Initial`. `requestNumber > 0` is a hedge arm +and is explicitly tagged `Hedging`. Because hedge arms 1..N are only +launched after their respective threshold delays elapse without primary +cancellation, **no phantom `Hedging` entry is structurally possible** — if +the primary wins under threshold, `CloneAndSendAsync` is never invoked for +the hedge arms. + +### 6. Lock-protected snapshots + +All three accessors (`HedgingStarted`, `GetRequestedRegions`, +`GetRespondedRegions`) acquire the same private lock and return snapshots +via `ToArray()` so callers cannot observe a torn list and cannot mutate +internal state. Internal appends acquire the same lock. + +### 7. Duplicates in responded regions are allowed + +The same physical region may respond more than once for a single operation +(e.g., gateway address response then per-replica responses). The responded +list preserves arrival order and duplicates — matching how +`AddRegionContacted` records contacts today. + +### 8. Reserved-but-unpopulated enum values + +`RequestedRegionReason.TransportRetry` and +`RequestedRegionReason.CircuitBreakerProbe` are part of the public surface +for forward compatibility but are not populated by v1 .NET: + +- **TransportRetry** — per-channel retries inside Direct are not surfaced + at this layer. +- **CircuitBreakerProbe** — PPCB rerouting decisions + (`GlobalPartitionEndpointManagerCore.TryAddPartitionLevelLocationOverride`) + occur inside `storeProxy.ProcessMessageAsync`, after the + pre-dispatch append site. Wiring this requires a flag on + Direct's `DocumentServiceRequestContext`, which is in a separate + closed-source package and tracked as a follow-up. + +XML docs and CHANGELOG note this clearly so callers won't be surprised by +absence. + +## Alternatives Considered + +### A. Add a new `TraceDatum` for hedging state + +Considered and rejected. It would change the +`Diagnostics.ToString()` JSON shape — a *de facto* contract that customers +parse today, which would risk breaking dashboards and logs even though it +isn't documented. The off-tree state on `TraceSummary` is also faster to +read (no tree walk). + +### B. Store directly on `CosmosTraceDiagnostics` + +The published spec sketches the state directly on `CosmosTraceDiagnostics`, +but that class is built lazily from the root trace at response time and +isn't available during the request lifecycle. Putting state on +`TraceSummary` (one per operation, shared across the tree) gives us the +same observable semantics with a natural lifecycle. + +### C. Add a new internal contract for cross-handler signaling + +Considered, but `RequestMessage.Properties` already exists, is already +deep-cloned per hedge arm, and is already shared with +`DocumentServiceRequest.Properties`. Using it avoids a new internal +contract and matches how other features (e.g., partition-key range +override) already piggyback on `Properties`. + +## AC Coverage Matrix + +| AC | Mechanism | +| ---- | --------- | +| AC1 | `HedgingDetectionState.HedgingStarted` flag, flipped on first `RequestedRegionReason.Hedging` append. | +| AC2 | Hedge arm reason set only inside `requestNumber > 0` branch; threshold delay gates the dispatch. | +| AC3 | `TransportHandler` appends to `GetRespondedRegions` via `ClientSideRequestStatisticsTraceDatum`; `HedgingStarted` is false. | +| AC4 | `ClientRetryPolicy.OnBeforeSendRequest` tags `OperationRetry` for same-region retries. | +| AC5 | `ClientRetryPolicy.OnBeforeSendRequest` tags `RegionFailover` when `RetryRequestOnPreferredLocations` is true. | +| AC6 | ⏸ Deferred. `RequestedRegionReason.CircuitBreakerProbe` reserved; wiring requires Direct-package change. | +| AC7 | `RequestedRegionReason.TransportRetry` reserved; per-channel retries not surfaced in v1. | +| AC8 | Hedge winner is appended both to `GetRequestedRegions` (with `Hedging`) and to `GetRespondedRegions`. | +| AC9 | Default virtual implementations on `CosmosDiagnostics` return `false` / empty. Covered by `CosmosDiagnosticsBackwardCompatTests`. | +| AC10 | `IReadOnlyList` is returned for both list accessors; snapshots are immutable arrays. | +| AC11 | State held off the trace tree → `ToString()` JSON shape unchanged. | +| AC12 | New struct (`RequestedRegion`) provides `Equals`/`==`/`GetHashCode`/`ToString` with case-insensitive name. | +| AC13 | No phantom `Hedging` entry — see Decision 5. | +| AC14 | Duplicates in `GetRespondedRegions` are allowed and preserved. | +| AC15 | Lock-protected snapshots; concurrent appends are safe — covered by `HedgingDetectionStateTests.ConcurrentAppends_AreThreadSafe`. | +| AC16 | Live multi-region smoke test runs against an existing CosmosClient with `AvailabilityStrategy` configured. | diff --git a/openspec/changes/hedging-detection-api/proposal.md b/openspec/changes/hedging-detection-api/proposal.md new file mode 100644 index 0000000000..1f6be0a545 --- /dev/null +++ b/openspec/changes/hedging-detection-api/proposal.md @@ -0,0 +1,112 @@ +## Why + +Users of the Azure Cosmos DB .NET SDK who enable cross-region hedging (or any +multi-region availability behavior) today have no first-class API to ask: + +1. *"Did the SDK actually start hedging this operation?"* +2. *"Which regions did the SDK dispatch this operation to, and why?"* +3. *"Which regions responded?"* + +The only currently supported observability path is to parse the JSON returned +by `Diagnostics.ToString()`, which is fragile, allocation-heavy, and not safe +for hot paths (logging, alerting, metrics). The closest existing API, +`Diagnostics.GetContactedRegions()`, only enumerates regions for which a +*response* was recorded and does not distinguish a dispatch reason (initial +attempt vs. retry vs. region failover vs. hedge arm). It also does not surface +"hedging began" as a single boolean. + +This proposal adds a small, focused public API to `CosmosDiagnostics` +covering all three questions, while preserving the existing JSON shape of +`ToString()` so callers continue to render diagnostics the same way. + +Tracking issue: [Azure/azure-cosmos-dotnet-v3#5867](https://github.com/Azure/azure-cosmos-dotnet-v3/issues/5867). + +## What Changes + +- **New `CosmosDiagnostics.HedgingStarted()`** — `virtual bool`, returns + `true` if at least one dispatch on this operation was tagged as a hedge + arm. Default implementation returns `false`. +- **New `CosmosDiagnostics.GetRequestedRegions()`** — `virtual IReadOnlyList`, + returns every region the SDK dispatched to in observed dispatch order, + each tagged with a `RequestedRegionReason`. Default implementation returns + an empty list. +- **New `CosmosDiagnostics.GetRespondedRegions()`** — `virtual IReadOnlyList`, + returns the names of all regions that produced a response, in arrival + order, with duplicates allowed. Default implementation returns an empty + list. +- **New `RequestedRegion`** — public readonly struct with `RegionName`, + `Reason`, `IEquatable` (case-insensitive on name), + `==`/`!=` operators, custom `GetHashCode`, and a `ToString` of + `"{regionName}:{reason}"`. +- **New `RequestedRegionReason`** — public `enum : byte` with values + `Initial`, `OperationRetry`, `RegionFailover`, `Hedging`, + `CircuitBreakerProbe`, `TransportRetry`. `TransportRetry` and + `CircuitBreakerProbe` are reserved for future use and are not populated + by the v1 .NET implementation. +- **Internal `HedgingDetectionState`** — per-trace lock-protected state + attached to `TraceSummary` (the same pattern used today for + `RegionsContacted`). Hosts the three lists/flag, exposes + `AppendRequested(string, RequestedRegionReason)`, + `AppendResponded(string)`, and snapshot getters. Defines an internal + `DispatchReasonPropertyKey` used to signal the dispatch reason from + upstream sites (`ClientRetryPolicy`, + `CrossRegionHedgingAvailabilityStrategy`) to the downstream dispatch + site (`TransportHandler`). +- **Wiring at four sites**: + 1. `CrossRegionHedgingAvailabilityStrategy.CloneAndSendAsync` — set + `Properties[DispatchReasonPropertyKey] = Hedging` for hedge arms + (`requestNumber > 0`). + 2. `ClientRetryPolicy.OnBeforeSendRequest` — after route resolution, + when a retry is in flight, set the reason to `OperationRetry` or + `RegionFailover` based on `retryContext.RetryRequestOnPreferredLocations`. + 3. `TransportHandler.ProcessMessageAsync` — after `ToDocumentServiceRequest`, + resolve the region for `LocationEndpointToRoute` via + `GlobalEndpointManager.GetLocation`, consume the property (default to + `Initial`) and append a `RequestedRegion`. Removes the property so + subsequent retries default unless a new reason is set. + 4. `ClientSideRequestStatisticsTraceDatum` — at the three response-record + sites (Direct response, HTTP response, HTTP exception) append the + responding region name to `HedgingDetectionState`, alongside the + existing `AddRegionContacted` call. + +## Capabilities + +### New Capabilities + +- `hedging-detection`: Public APIs on `CosmosDiagnostics` to efficiently + observe whether the SDK started hedging, the regions it dispatched to and + why, and the regions that responded — without parsing diagnostics strings. + +### Modified Capabilities + + + +## Impact + +- **Public API surface**: three new virtual methods on `CosmosDiagnostics`, + one new readonly struct (`RequestedRegion`), one new enum + (`RequestedRegionReason`). All additions are non-breaking — every new + member is virtual with a safe default, struct equality is well-defined, + enum has reserved values for future expansion. +- **Affected files**: + - `Microsoft.Azure.Cosmos/src/Diagnostics/CosmosDiagnostics.cs` + - `Microsoft.Azure.Cosmos/src/Diagnostics/CosmosTraceDiagnostics.cs` + - `Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegion.cs` *(new)* + - `Microsoft.Azure.Cosmos/src/Diagnostics/RequestedRegionReason.cs` *(new)* + - `Microsoft.Azure.Cosmos/src/Tracing/HedgingDetectionState.cs` *(new)* + - `Microsoft.Azure.Cosmos/src/Tracing/TraceSummary.cs` + - `Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs` + - `Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs` + - `Microsoft.Azure.Cosmos/src/Handler/TransportHandler.cs` + - `Microsoft.Azure.Cosmos/src/Tracing/TraceData/ClientSideRequestStatisticsTraceDatum.cs` +- **Performance**: O(1) per dispatch and per response append (single lock + acquire + list add). State is held off the trace tree, so the JSON shape + produced by `Diagnostics.ToString()` is unchanged. +- **API contract files**: `DotNetSDKAPI.net6.json` updated to include the + three new methods, the new struct, and the new enum. +- **Reserved enum values**: `RequestedRegionReason.TransportRetry` and + `RequestedRegionReason.CircuitBreakerProbe` are reserved for the future + and not populated in v1 — PPCB rerouting decisions happen inside the + closed-source `Microsoft.Azure.Cosmos.Direct` package after the dispatch + append site, and per-channel transport retries inside Direct are not + surfaced at this layer yet. diff --git a/openspec/changes/hedging-detection-api/specs/hedging-detection/spec.md b/openspec/changes/hedging-detection-api/specs/hedging-detection/spec.md new file mode 100644 index 0000000000..cd1586a8c8 --- /dev/null +++ b/openspec/changes/hedging-detection-api/specs/hedging-detection/spec.md @@ -0,0 +1,198 @@ +## ADDED Requirements + +### Requirement: Detect whether hedging started for an operation + +The `CosmosDiagnostics` class SHALL expose a public virtual method +`HedgingStarted()` returning `bool`. The method SHALL return `true` when +the SDK dispatched at least one hedge-arm request for the operation, and +`false` otherwise. + +#### Scenario: Non-hedged single-region request + +- **WHEN** an operation completes without any hedge arm being dispatched +- **THEN** `response.Diagnostics.HedgingStarted()` SHALL return `false` + +#### Scenario: Hedged request where a hedge arm was dispatched + +- **WHEN** an operation activated cross-region hedging AND at least one + hedge arm dispatch was issued before a final result arrived +- **THEN** `response.Diagnostics.HedgingStarted()` SHALL return `true` + +#### Scenario: Hedged-eligible request where the primary won under threshold + +- **WHEN** the cross-region hedging availability strategy is configured AND + the primary region produced a final response before the first hedge + threshold elapsed +- **THEN** no hedge arm was dispatched AND + `response.Diagnostics.HedgingStarted()` SHALL return `false` + +#### Scenario: Default implementation returns false + +- **WHEN** a custom subclass of `CosmosDiagnostics` does not override + `HedgingStarted()` +- **THEN** the default implementation SHALL return `false` + +#### Scenario: O(1) accessor + +- **WHEN** `HedgingStarted()` is called on any `CosmosDiagnostics` instance +- **THEN** the method SHALL return in O(1) time without walking the trace + tree, allocating memory, or performing string operations + +### Requirement: Enumerate dispatched regions with reason + +The `CosmosDiagnostics` class SHALL expose a public virtual method +`GetRequestedRegions()` returning `IReadOnlyList`. The +returned list SHALL contain one entry for every region the SDK dispatched +to during the operation, in observed dispatch order, each tagged with the +reason the SDK chose that region. + +#### Scenario: Single-region happy path + +- **WHEN** an operation runs end-to-end against the primary region with + no retries +- **THEN** `GetRequestedRegions()` SHALL contain exactly one entry tagged + `RequestedRegionReason.Initial` + +#### Scenario: Same-region retry + +- **WHEN** the SDK retries an operation against the same region under the + operation-retry policy +- **THEN** `GetRequestedRegions()` SHALL contain at least two entries: the + initial dispatch (`Initial`) followed by one or more retries tagged + `RequestedRegionReason.OperationRetry` + +#### Scenario: Region failover retry + +- **WHEN** the SDK retries an operation against a different preferred + region after deciding to fail over +- **THEN** the subsequent dispatch SHALL be recorded with reason + `RequestedRegionReason.RegionFailover` + +#### Scenario: Hedge arm dispatch + +- **WHEN** the cross-region hedging strategy dispatches a hedge arm for + this operation +- **THEN** that dispatch SHALL be recorded with reason + `RequestedRegionReason.Hedging` + +#### Scenario: Default implementation returns empty list + +- **WHEN** a custom subclass of `CosmosDiagnostics` does not override + `GetRequestedRegions()` +- **THEN** the default implementation SHALL return an empty + `IReadOnlyList` + +#### Scenario: Snapshot semantics + +- **WHEN** `GetRequestedRegions()` is called concurrently with the request + pipeline still appending entries +- **THEN** the returned list SHALL be a stable snapshot taken under the + internal lock AND subsequent mutations SHALL NOT be observed through + that snapshot + +### Requirement: Enumerate responded regions + +The `CosmosDiagnostics` class SHALL expose a public virtual method +`GetRespondedRegions()` returning `IReadOnlyList`. The returned +list SHALL contain the name of every region that produced a response +during the operation, in arrival order, with duplicates allowed. + +#### Scenario: Single-region happy path + +- **WHEN** an operation completes with one response from the primary + region +- **THEN** `GetRespondedRegions()` SHALL contain `["{primary}"]` + +#### Scenario: Hedge arm wins + +- **WHEN** the hedge arm to a secondary region produces a final response + before the primary +- **THEN** `GetRespondedRegions()` SHALL contain at least the secondary + region name (and SHALL also contain the primary if it produced any + observable response before cancellation) + +#### Scenario: Duplicates are preserved + +- **WHEN** the same region produces multiple responses for one operation + (e.g., the gateway response followed by replica responses) +- **THEN** every response SHALL be recorded; the list SHALL preserve + arrival order + +#### Scenario: Default implementation returns empty list + +- **WHEN** a custom subclass of `CosmosDiagnostics` does not override + `GetRespondedRegions()` +- **THEN** the default implementation SHALL return an empty + `IReadOnlyList` + +### Requirement: `RequestedRegion` value type + +The SDK SHALL expose a public readonly struct `RequestedRegion` with +properties `RegionName` (string) and `Reason` (`RequestedRegionReason`), +implementing `IEquatable`, with case-insensitive name +equality, a `GetHashCode` consistent with `Equals`, `==`/`!=` operators, +and a `ToString` of the form `"{regionName}:{reason}"`. + +#### Scenario: Equality is case-insensitive on region name + +- **WHEN** two `RequestedRegion` values have the same reason and region + names that differ only in casing +- **THEN** `Equals`, `==`, and `GetHashCode` SHALL agree that the two + values are equal + +#### Scenario: Null region name is rejected + +- **WHEN** a caller constructs `RequestedRegion(null, anyReason)` +- **THEN** the constructor SHALL throw `ArgumentNullException` + +### Requirement: `RequestedRegionReason` enumeration + +The SDK SHALL expose a public `enum : byte` named `RequestedRegionReason` +with values `Unknown`, `Initial`, `OperationRetry`, `RegionFailover`, +`Hedging`, `CircuitBreakerProbe`, and `TransportRetry`. `Unknown` SHALL be +the underlying-zero value and SHALL serve as the default sentinel for +`default(RequestedRegionReason)` and `default(RequestedRegion).Reason`. The +SDK SHALL NOT emit `Unknown` from a real dispatch. +`CircuitBreakerProbe` and `TransportRetry` SHALL be reserved for future use +and MAY not be populated by every SDK version; consumers SHALL treat the +enum as non-exhaustive. + +#### Scenario: Reserved values do not appear in v1 .NET + +- **WHEN** an operation completes on the v1 .NET implementation +- **THEN** no entry in `GetRequestedRegions()` SHALL have reason + `CircuitBreakerProbe` or `TransportRetry` + +#### Scenario: `default(RequestedRegion)` is observably distinct from a real dispatch + +- **WHEN** a caller inspects `default(RequestedRegion)` +- **THEN** `Reason` SHALL be `RequestedRegionReason.Unknown` +- **AND** the SDK SHALL NOT produce a `RequestedRegion` with reason + `Unknown` from any real dispatch path + +### Requirement: Diagnostics JSON shape is preserved + +The new hedging-detection state SHALL be held off the trace tree and SHALL +NOT introduce a new `TraceDatum` or otherwise modify the JSON shape +produced by `CosmosDiagnostics.ToString()`. + +#### Scenario: ToString output is unchanged for non-hedged paths + +- **WHEN** an operation completes without hedging on a release that adds + the hedging detection API +- **THEN** the JSON returned by `Diagnostics.ToString()` SHALL be the same + as before the API was added (apart from incidental version stamps) + +### Requirement: Backwards compatibility for `CosmosDiagnostics` subclasses + +A pre-existing customer subclass of `CosmosDiagnostics` that does not +override any of the new virtual methods SHALL continue to compile and run +without throwing when the new methods are invoked. + +#### Scenario: Legacy subclass returns safe defaults + +- **WHEN** a customer subclass of `CosmosDiagnostics` that predates the + new API is exercised +- **THEN** `HedgingStarted()` SHALL return `false`, + `GetRequestedRegions()` SHALL return an empty list, and + `GetRespondedRegions()` SHALL return an empty list diff --git a/openspec/changes/hedging-detection-api/tasks.md b/openspec/changes/hedging-detection-api/tasks.md new file mode 100644 index 0000000000..ff23484f3e --- /dev/null +++ b/openspec/changes/hedging-detection-api/tasks.md @@ -0,0 +1,54 @@ +## 1. Setup + +- [x] 1.1 Create a feature branch `feature/hedging-detection-api` off `main`. +- [x] 1.2 Preserve previous proposal under `feature/hedging-detection-api-legacy-pr5741` for reference. + +## 2. Public API — `CosmosDiagnostics` + +- [x] 2.1 Add `virtual bool HedgingStarted()` with default returning `false` and the SE-013 "dispatched, not necessarily wire-issued" `` paragraph. +- [x] 2.2 Add `virtual IReadOnlyList GetRequestedRegions()` with default returning `Array.Empty()`. +- [x] 2.3 Add `virtual IReadOnlyList GetRespondedRegions()` with default returning `Array.Empty()`. + +## 3. Public types + +- [x] 3.1 Create `RequestedRegion` (public readonly struct, `IEquatable`, case-insensitive name equality, `==`/`!=`, custom `GetHashCode` that does not depend on `System.HashCode`, `ToString` = `"{regionName}:{reason}"`). +- [x] 3.2 Create `RequestedRegionReason` (public `enum : byte`) with values `Unknown` (= 0, default sentinel), `Initial`, `OperationRetry`, `RegionFailover`, `Hedging`, `CircuitBreakerProbe`, `TransportRetry`. Document that `Unknown` is the default sentinel for `default(RequestedRegion).Reason` and is never emitted by the SDK, and that `TransportRetry` and `CircuitBreakerProbe` are reserved and not populated by v1 .NET. + +## 4. Internal state — `HedgingDetectionState` + +- [x] 4.1 Create `Microsoft.Azure.Cosmos.Tracing.HedgingDetectionState` (`sealed`, lock-protected) with `AppendRequested`, `AppendResponded`, `HedgingStarted`, and snapshot getters that return `ToArray()` under the lock. +- [x] 4.2 Add `internal const string DispatchReasonPropertyKey` used as the well-known key on `RequestMessage.Properties` / `DocumentServiceRequest.Properties` to carry the next-dispatch reason across handlers. +- [x] 4.3 Attach a fresh `HedgingDetectionState` to every `TraceSummary` (initialized eagerly). +- [x] 4.4 Add overrides of all three new virtual methods on `CosmosTraceDiagnostics`, delegating to `this.Value?.Summary?.HedgingDetectionState`. + +## 5. Dispatch-site wiring + +- [x] 5.1 In `CrossRegionHedgingAvailabilityStrategy.CloneAndSendAsync`, set `clonedRequest.Properties[DispatchReasonPropertyKey] = RequestedRegionReason.Hedging` for hedge arms (`requestNumber > 0`). Leave the primary arm untagged so it defaults to `Initial`. +- [x] 5.2 In `ClientRetryPolicy.OnBeforeSendRequest`, after `request.RequestContext.RouteToLocation(locationEndpoint)`, when `retryContext != null`, set `request.Properties[DispatchReasonPropertyKey]` to `RegionFailover` if `retryContext.RetryRequestOnPreferredLocations` else `OperationRetry`. Do not override a value already set by the orchestrator. +- [x] 5.3 In `TransportHandler.ProcessMessageAsync`, after `ToDocumentServiceRequest`, resolve the region for `LocationEndpointToRoute` via `GlobalEndpointManager.GetLocation`, read the reason from `serviceRequest.Properties` (default `Initial`), append a `RequestedRegion` to `request.Trace.Summary.HedgingDetectionState`, then remove the key so subsequent retries re-default. + +## 6. Response-site wiring + +- [x] 6.1 In `ClientSideRequestStatisticsTraceDatum.RecordResponse`, alongside the existing `AddRegionContacted` call, also call `HedgingDetectionState.AppendResponded(regionName)`. +- [x] 6.2 In `ClientSideRequestStatisticsTraceDatum.RecordHttpResponse`, mirror the same append. +- [x] 6.3 In `ClientSideRequestStatisticsTraceDatum.RecordHttpException`, mirror the same append. + +## 7. Tests + +- [x] 7.1 `RequestedRegionTests` — equality (case-insensitive), hash, `ToString`, null-name guard, operator coverage. +- [x] 7.2 `HedgingDetectionStateTests` — defaults, null/empty guards, `HedgingStarted` flip on `Hedging`, ordering / duplicates, snapshot independence, thread-safety. +- [x] 7.3 `CosmosDiagnosticsBackwardCompatTests` — legacy `CosmosDiagnostics` subclass that does not override the new virtuals returns safe defaults (AC9). +- [ ] 7.4 Emulator / fault-injection tests covering AC1, AC2, AC3, AC4, AC5, AC8, AC13, AC14, AC15. *(Tracked as follow-up — requires reuse of existing fault-injection infra.)* +- [ ] 7.5 Live multi-region smoke test (AC16). *(Tracked as follow-up — requires a real multi-region account.)* +- [ ] 7.6 Golden-file `ToString` stability test (AC11). *(Tracked as follow-up — existing baseline tests cover most paths.)* + +## 8. Contract / changelog + +- [x] 8.1 Regenerate `DotNetSDKAPI.net6.json` via `UpdateContracts.ps1`; diff contains only the three new methods, the new struct, and the new enum. +- [x] 8.2 Add a `## Unreleased Preview` entry to `changelog.md` referencing issue #5867. + +## 9. Validation + +- [x] 9.1 `dotnet build Microsoft.Azure.Cosmos\src\Microsoft.Azure.Cosmos.csproj -c Debug` — clean. +- [x] 9.2 New unit tests pass — 17 / 17 green on `net6.0`. +- [x] 9.3 GA contract enforcement test passes — 107 / 107 green on `net6.0` Release.