diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index c70b0b8668..347fc00ed3 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -136,6 +136,37 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider internal readonly AuthorizationTokenProvider cosmosAuthorization; private readonly bool isThinClientFeatureFlagEnabled = ConfigurationManager.IsThinClientEnabled(defaultValue: false); + + // Serializes the (disableCrossRegionalHedging, customerConfiguredAvailabilityStrategy, + // ConnectionPolicy.AvailabilityStrategy) mutation sequence performed by the gateway-driven + // hedging-override reconcile path. + // + // NOTE: This lock is NOT load-bearing under the current set of callers — the GlobalEndpointManager + // serializes its account-properties refreshes, and the init-time stash in + // InitializeGatewayConfigurationReaderAsync runs strictly before the background-refresh loop is + // started and before OnEnablePartitionLevelFailoverConfigChanged is subscribed. So in production + // today there is no real concurrency hazard to defend against here. + // + // The lock exists for two narrower reasons: + // 1. Internal test accessors (DisableCrossRegionalHedgingForTests, + // CustomerConfiguredAvailabilityStrategyForTests) can read/write the same state directly + // from arbitrary test threads, so the accessors and the reconcile path need to agree on + // a single mutex. + // 2. Future-proofing: UpdatePartitionLevelFailoverConfigWithAccountRefresh and + // ApplyHedgingStrategyForCurrentState are internal/private but reachable from any new + // caller that wires them up. If a future change ever invokes them off the GEM-serialized + // path, the stash/clear/restore sequence stays atomic instead of regressing silently. + // + // Per-request reads from RequestInvokerHandler intentionally do NOT take this lock: + // • disableCrossRegionalHedging is declared volatile (acquire semantics on read, release on + // write), which is sufficient for the per-request "is the gateway kill-switch on?" check. + // • ConnectionPolicy.AvailabilityStrategy is a reference field; reference assignment is atomic + // on the CLR, so a request observes either the pre- or post-transition strategy, never a + // half-applied state. + // Skipping the lock keeps the hot path on RequestInvokerHandler.AvailabilityStrategy(...) + // monitor-free. + private readonly object hedgingStrategyLock = new object(); + internal bool isThinClientEnabled; // Gateway has backoff/retry logic to hide transient errors. @@ -174,6 +205,23 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider private bool isDisposing; private bool isDisposed; + // Gateway-controlled override that disables all cross-regional hedging for PPAF accounts. + // Mirrors AccountProperties.DisableCrossRegionalHedging from the most recent account-properties refresh. + // + // Declared volatile so per-request reads from RequestInvokerHandler (via IsHedgingDisabledByGateway) + // and the init-time read in InitializePartitionLevelFailoverWithDefaultHedging do not need to take + // hedgingStrategyLock — volatile gives acquire semantics on the read and release semantics on the + // write, which is exactly what the "atomic kill-switch" contract requires. The lock continues to + // serialize the multi-field (flag + stashed strategy + ConnectionPolicy.AvailabilityStrategy) + // mutation sequence on the refresh / init-stash paths. + private volatile bool disableCrossRegionalHedging; + + // When the gateway disable flag is true, the customer's explicit AvailabilityStrategy + // (if any) is stashed here so it can be restored verbatim if the flag is later toggled back to false. + // Null when the customer never configured a strategy or when no stash is currently held. + // Mutated only under hedgingStrategyLock. + private AvailabilityStrategy customerConfiguredAvailabilityStrategy; + // creator of TransportClient is responsible for disposing it. private IStoreClientFactory storeClientFactory; internal CosmosHttpClient httpClient { get; private set; } @@ -6868,8 +6916,30 @@ private async Task InitializeGatewayConfigurationReaderAsync() await this.accountServiceConfiguration.InitializeAsync(); AccountProperties accountProperties = this.accountServiceConfiguration.AccountProperties; this.UseMultipleWriteLocations = this.ConnectionPolicy.UseMultipleWriteLocations && accountProperties.EnableMultipleWriteLocations; - this.GlobalEndpointManager.InitializeAccountPropertiesAndStartBackgroundRefresh(accountProperties); + + // Capture the initial gateway disableCrossRegionalHedging flag and stash any customer-configured + // AvailabilityStrategy so it can be restored if the flag is later toggled back to false. + // This must run BEFORE the GEM background refresh loop is started and BEFORE the change-event + // handler is subscribed: if either happens first, a refresh-driven event firing concurrently with + // initialization could otherwise be overwritten here, leaving the SDK with stale state and a + // mismatched GEM baseline that would never re-fire. + if (!this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride) + { + lock (this.hedgingStrategyLock) + { + this.disableCrossRegionalHedging = accountProperties.DisableCrossRegionalHedging ?? false; + if (this.disableCrossRegionalHedging && this.ConnectionPolicy.AvailabilityStrategy != null) + { + this.customerConfiguredAvailabilityStrategy = this.ConnectionPolicy.AvailabilityStrategy; + this.ConnectionPolicy.AvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging disabled at initialization by Gateway property disableCrossRegionalHedging=true"); + } + } + } + this.GlobalEndpointManager.OnEnablePartitionLevelFailoverConfigChanged += this.UpdatePartitionLevelFailoverConfigWithAccountRefresh; + this.GlobalEndpointManager.InitializeAccountPropertiesAndStartBackgroundRefresh(accountProperties); } internal string GetUserAgentFeatures() @@ -6900,6 +6970,13 @@ internal string GetUserAgentFeatures() internal void InitializePartitionLevelFailoverWithDefaultHedging() { + if (this.disableCrossRegionalHedging) + { + DefaultTrace.TraceInformation( + "DocumentClient: Skipping default PPAF hedging because Gateway property disableCrossRegionalHedging=true"); + return; + } + if (this.ConnectionPolicy.EnablePartitionLevelFailover && this.ConnectionPolicy.AvailabilityStrategy == null) { @@ -6926,8 +7003,9 @@ internal void InitializePartitionLevelFailoverWithDefaultHedging() } } - private void UpdatePartitionLevelFailoverConfigWithAccountRefresh( - bool isEnabled) + internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( + bool latestIsEnabled, + bool latestDisableCrossRegionalHedging) { // Only update if client-level override is not disabled if (this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride) @@ -6936,36 +7014,228 @@ private void UpdatePartitionLevelFailoverConfigWithAccountRefresh( return; } - DefaultTrace.TraceInformation( - "DocumentClient: PPAF Account Level Config Updated. Updating EnablePartitionLevelFailover to {0}", - isEnabled); + lock (this.hedgingStrategyLock) + { + bool ppafEnablementChanged = this.ConnectionPolicy.EnablePartitionLevelFailover != latestIsEnabled; + bool hedgingFlagChanged = this.disableCrossRegionalHedging != latestDisableCrossRegionalHedging; + + // No-op when nothing has actually changed. + // + // In production this branch is unreachable: GlobalEndpointManager's + // RefreshLocationAsync only fires OnEnablePartitionLevelFailoverConfigChanged when + // either EnablePartitionLevelFailover or lastKnownDisableCrossRegionalHedging + // transitions, so the callback is invoked only on a real change. + // + // The guard is defense-in-depth for direct callers of this internal method — + // primarily unit tests that exercise the reconcile logic without going through the + // GEM event, and any future caller that wires up its own invocation. Without it, + // ApplyHedgingStrategyForCurrentState would still run on a no-change call and could + // clear an SDK-default strategy that was correctly installed. + if (!ppafEnablementChanged && !hedgingFlagChanged) + { + return; + } + + if (ppafEnablementChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: PPAF Account Level Config Updated. Updating EnablePartitionLevelFailover to {0}", + latestIsEnabled); - // Step 1: Enable partition level failover. - this.PartitionKeyRangeLocation.SetIsPPAFEnabled(isEnabled); - this.ConnectionPolicy.EnablePartitionLevelFailover = isEnabled; + // Step 1: Enable partition level failover. + this.PartitionKeyRangeLocation.SetIsPPAFEnabled(latestIsEnabled); + this.ConnectionPolicy.EnablePartitionLevelFailover = latestIsEnabled; - // Step 2: Enable partition level circuit breaker. - this.PartitionKeyRangeLocation.SetIsPPCBEnabled(isEnabled); - this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker = isEnabled; + // Step 2: Enable partition level circuit breaker. + this.PartitionKeyRangeLocation.SetIsPPCBEnabled(latestIsEnabled); + this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker = latestIsEnabled; + } - // Step 3: Enable default hedging strategy if partition level failover is enabled. - if (isEnabled && this.ConnectionPolicy.AvailabilityStrategy == null) - { - this.InitializePartitionLevelFailoverWithDefaultHedging(); + if (hedgingFlagChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: Gateway disableCrossRegionalHedging flag changed to {0}", + latestDisableCrossRegionalHedging); + this.disableCrossRegionalHedging = latestDisableCrossRegionalHedging; + } + + // Step 3: Reconcile the AvailabilityStrategy with the latest account state. + // + // Note: this call is intentionally outside the `if (hedgingFlagChanged)` block above + // because reconciliation is also required when PPAF enablement toggles without the + // hedging flag changing. Specifically: + // • PPAF transitioned off → drop the SDK-default strategy we previously installed. + // • PPAF transitioned on with no customer strategy → install the SDK default. + // The early-return at the top of the method already guarantees we get here only when + // at least one of (ppafEnablementChanged, hedgingFlagChanged) is true, so this call + // is never wasted. The gateway disable flag has the highest precedence — when true, + // hedging is OFF regardless of any explicit or default configuration. + this.ApplyHedgingStrategyForCurrentState(); + + if (ppafEnablementChanged) + { + // Step 4: Update the user agent features. Hedging-flag-only changes do not affect the + // PPAF-related user-agent feature flags. + this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); + + DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + } + + if (hedgingFlagChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: Successfully reconciled hedging strategy dynamically (disableCrossRegionalHedging={0})", + latestDisableCrossRegionalHedging); + } } - else + } + + /// + /// Reconciles with the current values of + /// and . + /// + /// + /// Precedence (highest first): + /// 1. Gateway disableCrossRegionalHedging = true: stash any non-default strategy and clear it. + /// 2. Customer explicitly configured a strategy: keep / restore that strategy. + /// 3. PPAF enabled with no explicit strategy: apply SDK default PPAF hedging. + /// 4. PPAF disabled: clear any SDK-default strategy that we previously installed. + /// Caller must hold ; the entry-point lock is reentrant so this + /// method also acquires it for direct invocations from tests / future callers. + /// + private void ApplyHedgingStrategyForCurrentState() + { + lock (this.hedgingStrategyLock) { - if (this.ConnectionPolicy.AvailabilityStrategy is CrossRegionHedgingAvailabilityStrategy strategy && strategy.IsSDKDefaultStrategyForPPAF) + if (this.disableCrossRegionalHedging) + { + AvailabilityStrategy currentStrategy = this.ConnectionPolicy.AvailabilityStrategy; + if (currentStrategy != null) + { + bool currentIsSdkDefault = currentStrategy is CrossRegionHedgingAvailabilityStrategy hedging + && hedging.IsSDKDefaultStrategyForPPAF; + + // Only stash customer-configured strategies. The SDK default can be regenerated + // deterministically from PPAF state, so re-stashing it would only cause confusion + // when reconciling later. + if (!currentIsSdkDefault) + { + if (this.customerConfiguredAvailabilityStrategy == null) + { + this.customerConfiguredAvailabilityStrategy = currentStrategy; + } + else if (!ReferenceEquals(this.customerConfiguredAvailabilityStrategy, currentStrategy)) + { + // A previously-stashed customer strategy is still held while a different + // non-default strategy is currently on ConnectionPolicy. Silently dropping + // the new strategy would lose customer configuration, so surface this loudly + // — it indicates a re-entrant or otherwise unexpected code path. + DefaultTrace.TraceWarning( + "DocumentClient: ApplyHedgingStrategyForCurrentState observed a non-default " + + "AvailabilityStrategy while a previously-stashed customer strategy is still held; " + + "the current strategy will be cleared without being stashed. This may indicate a " + + "re-entrant code path."); + } + } + + this.ConnectionPolicy.AvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging disabled by Gateway property disableCrossRegionalHedging=true"); + } + return; + } + + // disableCrossRegionalHedging == false: restore or rebuild the appropriate strategy. + if (this.customerConfiguredAvailabilityStrategy != null) + { + this.ConnectionPolicy.AvailabilityStrategy = this.customerConfiguredAvailabilityStrategy; + this.customerConfiguredAvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging re-enabled — restored customer-configured AvailabilityStrategy"); + return; + } + + if (this.ConnectionPolicy.EnablePartitionLevelFailover && this.ConnectionPolicy.AvailabilityStrategy == null) + { + this.InitializePartitionLevelFailoverWithDefaultHedging(); + if (this.ConnectionPolicy.AvailabilityStrategy != null) + { + DefaultTrace.TraceInformation( + "DocumentClient: Hedging re-enabled — applied SDK default PPAF hedging strategy"); + } + return; + } + + if (!this.ConnectionPolicy.EnablePartitionLevelFailover + && this.ConnectionPolicy.AvailabilityStrategy is CrossRegionHedgingAvailabilityStrategy sdkDefault + && sdkDefault.IsSDKDefaultStrategyForPPAF) { - // If the user has not set a custom availability strategy, then we will reset it to null. + // PPAF disabled — drop the SDK default we previously installed. this.ConnectionPolicy.AvailabilityStrategy = null; } } + } - // Step 4: Update the user agent features. - this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); + /// + /// Lock-free atomic read of the cached Gateway disableCrossRegionalHedging flag. Used by + /// on every + /// request to enforce the operator-override precedence over per-request and client-level strategy: + /// when the Gateway flag is true, hedging is OFF for every request on this client, + /// regardless of where the strategy was configured. + /// + /// + /// Safe to read without because the backing field is declared + /// volatile — the read has acquire semantics, so any write that completed on the refresh + /// path before its lock release is visible here. Taking a monitor on every request would impose + /// uncontended-but-non-zero cost on the SDK hot path (steady-state benchmarks would not surface + /// the regression). + /// + internal bool IsHedgingDisabledByGateway => this.disableCrossRegionalHedging; + + // Test-only accessors. Visible to the unit-test assembly via [InternalsVisibleTo] in + // Microsoft.Azure.Cosmos.csproj. + // + // Why these exist rather than mocking AccountProperties: + // • The "real-time" path that populates this state runs through + // CosmosAccountServiceConfiguration.InitializeAsync → GatewayAccountReader.InitializeReaderAsync, + // which currently performs a real HTTPS account-properties read against the configured + // endpoint and has no injection seam for a fake account snapshot. Pre-populating these + // fields via AccountProperties is therefore not achievable from a pure unit test today. + // • These accessors let the unit tests pin the (disableCrossRegionalHedging, + // customerConfiguredAvailabilityStrategy) precondition directly and then invoke the + // reconcile entry points (UpdatePartitionLevelFailoverConfigWithAccountRefresh, + // ApplyHedgingStrategyForCurrentState) to verify the transition logic without any I/O. + // • They are preferred over System.Reflection because renames or refactors of the backing + // fields are caught at compile time, rather than blowing up at test runtime with + // NullReferenceException from a stale FieldInfo cache. + // + // Once a proper IGatewayAccountReader / IAccountServiceConfiguration mocking seam is + // introduced these accessors should be removed in favor of mocking the account snapshot. + internal bool DisableCrossRegionalHedgingForTests + { + // Read is lock-free because disableCrossRegionalHedging is volatile (see field declaration). + // The setter retains the lock so a test write happens-before any subsequent observation on the + // reconcile path that mutates the companion fields (customerConfiguredAvailabilityStrategy, + // ConnectionPolicy.AvailabilityStrategy) under the same lock. + get => this.disableCrossRegionalHedging; + set + { + lock (this.hedgingStrategyLock) + { + this.disableCrossRegionalHedging = value; + } + } + } - DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + internal AvailabilityStrategy CustomerConfiguredAvailabilityStrategyForTests + { + get + { + lock (this.hedgingStrategyLock) + { + return this.customerConfiguredAvailabilityStrategy; + } + } } internal void CaptureSessionToken(DocumentServiceRequest request, DocumentServiceResponse response) diff --git a/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs b/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs index 168e280bfd..a0ff12adf4 100644 --- a/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs +++ b/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs @@ -124,12 +124,24 @@ public override async Task SendAsync( /// /// This method determines if there is an availability strategy that the request can use. - /// Note that the request level availability strategy options override the client level options. + /// Note that the request level availability strategy options override the client level options, + /// but the Gateway-driven operator override () + /// takes absolute precedence over both — when the Gateway flag + /// disableCrossRegionalHedging is true, hedging is OFF for every request on this + /// client regardless of where the strategy was configured. /// /// /// whether the request should be a parallel hedging request. public AvailabilityStrategyInternal AvailabilityStrategy(RequestMessage request) { + // Gateway-driven operator override has absolute precedence over any request-level or + // client-level AvailabilityStrategy. See spec.md → "Gateway flag disables all hedging + // when true" and tasks.md item 4.1. + if (this.client.DocumentClient.IsHedgingDisabledByGateway) + { + return null; + } + AvailabilityStrategy strategy = request.RequestOptions?.AvailabilityStrategy ?? this.client.DocumentClient.ConnectionPolicy.AvailabilityStrategy; diff --git a/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs b/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs index d0c80c6700..096268a89c 100644 --- a/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs +++ b/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs @@ -248,6 +248,23 @@ internal long ProvisionedDocumentStorageInMB [JsonProperty(PropertyName = Constants.Properties.EnablePerPartitionFailoverBehavior)] internal bool? EnablePartitionLevelFailover { get; set; } + /// + /// Gets the gateway-controlled override that disables cross-regional hedging for this account. + /// + /// + /// When this flag is , the SDK disables all hedging (both SDK-default PPAF + /// hedging and any explicit customer-configured ) regardless + /// of any other configuration. When the flag is or + /// (absent from the Gateway response), existing hedging behavior is preserved. The flag is intended + /// as an operational escape hatch and is not exposed through any public SDK API surface. + /// + // TODO: The JSON property name is hard-coded here because the corresponding constant has not yet been + // published in the Microsoft.Azure.Cosmos.Direct package referenced by this SDK. Once Direct is updated + // to expose this name on Constants.Properties, refactor this attribute to read from + // Constants.Properties. for consistency with the other AccountProperties JSON bindings. + [JsonProperty(PropertyName = "disableCrossRegionalHedging")] + internal bool? DisableCrossRegionalHedging { get; set; } + private IDictionary QueryStringToDictConverter() { if (!string.IsNullOrEmpty(this.QueryEngineConfigurationString)) diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs index 8cadc2f6e8..ed88cf4166 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs @@ -41,12 +41,23 @@ internal class GlobalEndpointManager : IGlobalEndpointManager private readonly object isAccountRefreshInProgressLock = new object(); private bool isAccountRefreshInProgress = false; private bool isBackgroundAccountRefreshActive = false; - private DateTime LastBackgroundRefreshUtc = DateTime.MinValue; - - /// - /// Event that is raised when PPAF (Per Partition Automatic Failover) enablement status changes - /// - internal event Action? OnEnablePartitionLevelFailoverConfigChanged; + private DateTime LastBackgroundRefreshUtc = DateTime.MinValue; + + // Last observed value of the account-level disableCrossRegionalHedging flag. + // Tracked separately so the change event fires when only this flag toggles. + private bool lastKnownDisableCrossRegionalHedging = false; + + /// + /// Event that is raised when PPAF (Per Partition Automatic Failover) enablement status changes + /// or when the gateway-controlled disableCrossRegionalHedging flag toggles. + /// + /// + /// First argument is the latest EnablePartitionLevelFailover value observed from the + /// Gateway (falls back to the existing connection-policy value when the property is absent). + /// Second argument is the latest disableCrossRegionalHedging value (false when absent + /// from the Gateway response). + /// + internal event Action? OnEnablePartitionLevelFailoverConfigChanged; public GlobalEndpointManager( IDocumentClientInternal owner, @@ -608,10 +619,14 @@ public virtual void InitializeAccountPropertiesAndStartBackgroundRefresh(Account return; } - if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride && databaseAccount.EnablePartitionLevelFailover.HasValue) - { - this.connectionPolicy.EnablePartitionLevelFailover = databaseAccount.EnablePartitionLevelFailover.Value; - } + if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride && databaseAccount.EnablePartitionLevelFailover.HasValue) + { + this.connectionPolicy.EnablePartitionLevelFailover = databaseAccount.EnablePartitionLevelFailover.Value; + } + + // Capture initial disableCrossRegionalHedging baseline so the change-event only fires on + // subsequent transitions, not on the first observation. + this.lastKnownDisableCrossRegionalHedging = databaseAccount.DisableCrossRegionalHedging ?? false; GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(databaseAccount); @@ -768,13 +783,39 @@ private async Task RefreshDatabaseAccountInternalAsync(bool forceRefresh) try { this.LastBackgroundRefreshUtc = DateTime.UtcNow; - AccountProperties accountProperties = await this.GetDatabaseAccountAsync(true); - - if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride - && accountProperties.EnablePartitionLevelFailover.HasValue - && (this.connectionPolicy.EnablePartitionLevelFailover != accountProperties.EnablePartitionLevelFailover.Value)) - { - this.OnEnablePartitionLevelFailoverConfigChanged?.Invoke(accountProperties.EnablePartitionLevelFailover.Value); + AccountProperties accountProperties = await this.GetDatabaseAccountAsync(true); + + bool ignorePpafChanges = this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride; + + bool ppafEnablementChanged = !ignorePpafChanges + && accountProperties.EnablePartitionLevelFailover.HasValue + && (this.connectionPolicy.EnablePartitionLevelFailover != accountProperties.EnablePartitionLevelFailover.Value); + + // Hedging change-detection mirrors the PPAF .HasValue guard above: + // a missing property in the response is "no signal", NOT an implicit false. + // This prevents a transient gateway response that drops the property + // (e.g., partial regional failover, stale gateway version) from being + // interpreted as a true -> false transition that re-enables hedging + // during the very window the operator most wants it disabled. + // + // Runbook contract: on-call disables via an explicit "false" property + // value, not by removing the property override. + bool disableHedgingFlagChanged = !ignorePpafChanges + && accountProperties.DisableCrossRegionalHedging.HasValue + && (accountProperties.DisableCrossRegionalHedging.Value != this.lastKnownDisableCrossRegionalHedging); + + if (ppafEnablementChanged || disableHedgingFlagChanged) + { + bool latestPpafEnabled = accountProperties.EnablePartitionLevelFailover + ?? this.connectionPolicy.EnablePartitionLevelFailover; + + // Only advance lastKnown when the gateway emitted an explicit value; otherwise + // preserve the cached value so a later property-restored response diffs against + // the previously-honored state (rather than against an implicit false baseline). + bool latestDisableHedging = accountProperties.DisableCrossRegionalHedging + ?? this.lastKnownDisableCrossRegionalHedging; + this.lastKnownDisableCrossRegionalHedging = latestDisableHedging; + this.OnEnablePartitionLevelFailoverConfigChanged?.Invoke(latestPpafEnabled, latestDisableHedging); } GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(accountProperties); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs index 3b4b9c1887..3fb1733fb2 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs @@ -67,7 +67,7 @@ public void ValidatePropertySerialization() string id = "testId"; this.TestProperty( id, - $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""enableNRegionSynchronousCommit"":false}}"); + $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""disableCrossRegionalHedging"":null,""enableNRegionSynchronousCommit"":false}}"); this.TestProperty( id, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs index 00f00156fc..f865894418 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs @@ -143,7 +143,7 @@ public void ValidateCustomSerializerNotUsedForInternalTypes() this.TestProperty( serializerCore, id, - $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""enableNRegionSynchronousCommit"":false}}"); + $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""disableCrossRegionalHedging"":null,""enableNRegionSynchronousCommit"":false}}"); this.TestProperty( serializerCore, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs new file mode 100644 index 0000000000..77f2d1b9e5 --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs @@ -0,0 +1,367 @@ +namespace Microsoft.Azure.Cosmos.Tests +{ + using System; + using System.Net.Http; + using Microsoft.Azure.Cosmos; + using Microsoft.Azure.Cosmos.Handlers; + using Microsoft.Azure.Documents; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using Newtonsoft.Json; + + /// + /// Unit tests for the Gateway-controlled disableCrossRegionalHedging flag on + /// and the resulting hedging strategy reconciliation in + /// . + /// + [TestClass] + public class GatewayHedgingOverrideTests + { + private const string AccountEndpoint = "https://localhost:8081/"; + private const string AccountKey = "C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw=="; + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_True_Deserializes() + { + string json = "{\"id\":\"test\",\"disableCrossRegionalHedging\":true}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsTrue(props.DisableCrossRegionalHedging.HasValue); + Assert.IsTrue(props.DisableCrossRegionalHedging.Value); + } + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_False_Deserializes() + { + string json = "{\"id\":\"test\",\"disableCrossRegionalHedging\":false}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsTrue(props.DisableCrossRegionalHedging.HasValue); + Assert.IsFalse(props.DisableCrossRegionalHedging.Value); + } + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_Absent_DefaultsToNull() + { + string json = "{\"id\":\"test\"}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsFalse(props.DisableCrossRegionalHedging.HasValue); + } + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_Unknown_FieldDoesNotBreakDeserialization() + { + string json = "{\"id\":\"test\",\"disableCrossRegionalHedging\":true,\"someFutureFlag\":\"foo\"}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsTrue(props.DisableCrossRegionalHedging.GetValueOrDefault()); + } + + [TestMethod] + public void InitializePartitionLevelFailoverWithDefaultHedging_FlagTrue_SkipsApplyingDefaultStrategy() + { + DocumentClient client = CreateClient(new ConnectionPolicy { EnablePartitionLevelFailover = true }); + try + { + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Pre-condition: no strategy configured"); + + SetDisableHedgingFlag(client, true); + client.InitializePartitionLevelFailoverWithDefaultHedging(); + + Assert.IsNull( + client.ConnectionPolicy.AvailabilityStrategy, + "Default hedging must NOT be applied when disableCrossRegionalHedging=true"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void InitializePartitionLevelFailoverWithDefaultHedging_FlagFalse_AppliesDefaultStrategy() + { + DocumentClient client = CreateClient(new ConnectionPolicy { EnablePartitionLevelFailover = true }); + try + { + SetDisableHedgingFlag(client, false); + client.InitializePartitionLevelFailoverWithDefaultHedging(); + + Assert.IsNotNull( + client.ConnectionPolicy.AvailabilityStrategy, + "Default PPAF hedging must be applied when disableCrossRegionalHedging=false"); + } + finally + { + client.Dispose(); + } + } + + /// + /// Regression test for the init-path race called out on PR #5829: in production, + /// is invoked + /// from DocumentClient.cs line 1108 — i.e. after + /// InitializeGatewayConfigurationReaderAsync subscribes the + /// + /// handler and starts the background account-properties refresh loop. A refresh that fires in + /// that narrow window can flip disableCrossRegionalHedging via + /// + /// before the init thread reaches the default-hedging block. Without acquire/release semantics + /// on the flag field, the init-path read could observe a stale value and silently install the + /// SDK default hedging strategy after the operator just disabled hedging. + /// + /// Companion to : + /// that test pins the flag through the synthetic test setter; this one exercises the actual + /// refresh code path that the race scenario depends on. + /// + [TestMethod] + public void InitializePartitionLevelFailoverWithDefaultHedging_AfterRefreshDrivenFlagFlip_SkipsApplyingDefaultStrategy() + { + DocumentClient client = CreateClient(new ConnectionPolicy { EnablePartitionLevelFailover = true }); + try + { + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Pre-condition: no strategy configured"); + Assert.IsFalse(client.DisableCrossRegionalHedgingForTests, "Pre-condition: flag starts at default false"); + + // Simulate a refresh-driven flag flip — this is exactly what GEM's background-refresh + // loop calls when AccountProperties.DisableCrossRegionalHedging transitions, and it is + // the write that the init-path read at DocumentClient.cs:6963 races against. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh( + latestIsEnabled: true, + latestDisableCrossRegionalHedging: true); + + Assert.IsTrue( + client.DisableCrossRegionalHedgingForTests, + "Refresh callback must have published the flag transition"); + + // Now exercise the init-path read. With volatile semantics on the field, the read here + // has acquire semantics and is guaranteed to observe the published value above — and + // therefore must skip applying the SDK default hedging strategy. + client.InitializePartitionLevelFailoverWithDefaultHedging(); + + Assert.IsNull( + client.ConnectionPolicy.AvailabilityStrategy, + "Init-path read of disableCrossRegionalHedging must observe the post-refresh value " + + "and skip applying the SDK default hedging strategy"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_FlagToggleOn_StashesCustomerStrategyAndClearsAvailabilityStrategy() + { + ConnectionPolicy policy = new ConnectionPolicy { EnablePartitionLevelFailover = true }; + CrossRegionHedgingAvailabilityStrategy customerStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + policy.AvailabilityStrategy = customerStrategy; + + DocumentClient client = CreateClient(policy); + try + { + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); + + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Strategy must be cleared when disable flag flips to true"); + Assert.AreSame( + customerStrategy, + GetCustomerStashedStrategy(client), + "Customer-configured strategy must be stashed for later restoration"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_FlagToggleOff_RestoresStashedCustomerStrategy() + { + ConnectionPolicy policy = new ConnectionPolicy { EnablePartitionLevelFailover = true }; + CrossRegionHedgingAvailabilityStrategy customerStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + policy.AvailabilityStrategy = customerStrategy; + + DocumentClient client = CreateClient(policy); + try + { + // Step 1: disable hedging — strategy is stashed. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy); + + // Step 2: re-enable hedging — strategy is restored. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: false); + + Assert.AreSame( + customerStrategy, + client.ConnectionPolicy.AvailabilityStrategy, + "Customer-configured strategy must be restored when the disable flag toggles back to false"); + Assert.IsNull(GetCustomerStashedStrategy(client), "Stash must be cleared after restoration"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_ClientLevelOverrideDisabled_FlagIsIgnored() + { + ConnectionPolicy policy = new ConnectionPolicy + { + EnablePartitionLevelFailover = true, + DisablePartitionLevelFailoverClientLevelOverride = true, + }; + CrossRegionHedgingAvailabilityStrategy customerStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + policy.AvailabilityStrategy = customerStrategy; + + DocumentClient client = CreateClient(policy); + try + { + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); + + Assert.AreSame( + customerStrategy, + client.ConnectionPolicy.AvailabilityStrategy, + "When DisablePartitionLevelFailoverClientLevelOverride=true, the gateway flag must be ignored entirely"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_FlagTrue_DoesNotStashSDKDefaultStrategy() + { + ConnectionPolicy policy = new ConnectionPolicy { EnablePartitionLevelFailover = true }; + DocumentClient client = CreateClient(policy); + try + { + // Apply SDK default first. + client.InitializePartitionLevelFailoverWithDefaultHedging(); + Assert.IsNotNull(client.ConnectionPolicy.AvailabilityStrategy); + Assert.IsTrue(((CrossRegionHedgingAvailabilityStrategy)client.ConnectionPolicy.AvailabilityStrategy).IsSDKDefaultStrategyForPPAF); + + // Now flip the flag — the SDK default should be cleared but NOT stashed. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); + + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy); + Assert.IsNull( + GetCustomerStashedStrategy(client), + "SDK-default strategy must NOT be stashed — it can be regenerated deterministically"); + + // Toggle back off — SDK default is rebuilt from PPAF state. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: false); + + Assert.IsNotNull(client.ConnectionPolicy.AvailabilityStrategy); + Assert.IsTrue(((CrossRegionHedgingAvailabilityStrategy)client.ConnectionPolicy.AvailabilityStrategy).IsSDKDefaultStrategyForPPAF); + } + finally + { + client.Dispose(); + } + } + + /// + /// Regression test: when the Gateway flag is true on the underlying + /// , + /// MUST return null even if the request itself supplies a per-request + /// via . + /// This is the absolute-precedence guarantee called out in the spec — the operator's + /// kill-switch wins over both per-request and client-level configuration. + /// + [TestMethod] + public void RequestInvokerHandler_FlagTrue_PerRequestStrategy_ReturnsNull() + { + using CosmosClient mockCosmosClient = MockCosmosUtil.CreateMockCosmosClient(); + mockCosmosClient.DocumentClient.DisableCrossRegionalHedgingForTests = true; + + RequestInvokerHandler handler = new RequestInvokerHandler( + mockCosmosClient, + requestedClientConsistencyLevel: null, + requestedClientReadConsistencyStrategy: null, + requestedClientPriorityLevel: null, + requestedClientThroughputBucket: null); + + using RequestMessage request = new RequestMessage( + HttpMethod.Get, + new Uri("/dbs/testdb/colls/testcontainer/docs/testId", UriKind.Relative)) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + RequestOptions = new RequestOptions + { + AvailabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)) + } + }; + + AvailabilityStrategyInternal resolved = handler.AvailabilityStrategy(request); + + Assert.IsNull( + resolved, + "Gateway operator override must take absolute precedence over per-request AvailabilityStrategy"); + } + + /// + /// Companion to : + /// when the Gateway flag is false, the per-request + /// MUST be honored — the override only suppresses hedging while the flag is true. + /// + [TestMethod] + public void RequestInvokerHandler_FlagFalse_HonorsPerRequestStrategy() + { + using CosmosClient mockCosmosClient = MockCosmosUtil.CreateMockCosmosClient(); + mockCosmosClient.DocumentClient.DisableCrossRegionalHedgingForTests = false; + + RequestInvokerHandler handler = new RequestInvokerHandler( + mockCosmosClient, + requestedClientConsistencyLevel: null, + requestedClientReadConsistencyStrategy: null, + requestedClientPriorityLevel: null, + requestedClientThroughputBucket: null); + + CrossRegionHedgingAvailabilityStrategy perRequestStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + + using RequestMessage request = new RequestMessage( + HttpMethod.Get, + new Uri("/dbs/testdb/colls/testcontainer/docs/testId", UriKind.Relative)) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + RequestOptions = new RequestOptions + { + AvailabilityStrategy = perRequestStrategy + } + }; + + AvailabilityStrategyInternal resolved = handler.AvailabilityStrategy(request); + + Assert.IsNotNull(resolved, "Per-request strategy must be honored when Gateway flag is false"); + Assert.AreSame( + perRequestStrategy, + resolved, + "Per-request strategy must be returned verbatim when Gateway flag is false"); + } + + private static DocumentClient CreateClient(ConnectionPolicy policy) + { + return new DocumentClient(new Uri(AccountEndpoint), AccountKey, policy); + } + + private static void SetDisableHedgingFlag(DocumentClient client, bool value) + { + client.DisableCrossRegionalHedgingForTests = value; + } + + private static AvailabilityStrategy GetCustomerStashedStrategy(DocumentClient client) + { + return client.CustomerConfiguredAvailabilityStrategyForTests; + } + } +} diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs index c61aa1c756..e88a4ea591 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs @@ -542,6 +542,123 @@ void TraceHandler(string message) Assert.IsTrue(isExceptionLogged, "The exception was logged as a warning trace event."); } + /// + /// Regression test for the operator-only kill-switch property-drop scenario. + /// + /// When the Gateway first reports disableCrossRegionalHedging=true and then a later + /// account-refresh response omits the property entirely (HasValue == false), the change + /// event MUST NOT fire — an absent property is "no signal", not an implicit transition to + /// false. Re-enabling hedging silently on a property drop would defeat the entire purpose + /// of the kill-switch during the exact window the operator most wants it disabled + /// (transient gateway response shapes, partial regional failovers, stale gateway versions). + /// + /// The runbook contract is "to disable, set the property to explicit false" — never by + /// removing it. + /// + [TestMethod] + public async Task RefreshDatabaseAccount_DisableCrossRegionalHedging_PropertyDroppedAfterTrue_DoesNotFireChangeEvent() + { + // Arrange — initial AccountProperties with disableCrossRegionalHedging=true. + AccountProperties initialAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + }, + DisableCrossRegionalHedging = true + }; + + // Refresh response drops the property entirely (HasValue == false). Read locations are + // preserved so the refresh path itself completes successfully. + AccountProperties refreshAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + } + // DisableCrossRegionalHedging intentionally NOT set (null). + }; + + Mock mockOwner = new Mock(); + mockOwner.Setup(owner => owner.ServiceEndpoint).Returns(new Uri("https://defaultendpoint.net/")); + mockOwner.Setup(owner => owner.GetDatabaseAccountInternalAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(refreshAccount); + + ConnectionPolicy connectionPolicy = new ConnectionPolicy(); + + using GlobalEndpointManager gem = new GlobalEndpointManager(mockOwner.Object, connectionPolicy); + + // Seed the baseline: lastKnownDisableCrossRegionalHedging = true. + gem.InitializeAccountPropertiesAndStartBackgroundRefresh(initialAccount); + + int changeEventInvocations = 0; + gem.OnEnablePartitionLevelFailoverConfigChanged += (_, _) => Interlocked.Increment(ref changeEventInvocations); + + // Act — force a refresh that observes the property-dropped response. + await gem.RefreshLocationAsync(forceRefresh: true); + + // Assert — change event must NOT fire on a true -> null transition. + Assert.AreEqual( + 0, + changeEventInvocations, + "Dropping the disableCrossRegionalHedging property from the gateway response must not be treated as a transition to false. The kill-switch is only released by an explicit 'false' value."); + } + + /// + /// Positive companion to : + /// when the gateway emits an explicit false after a prior true, the change + /// event MUST fire so the SDK can restore hedging. This validates the .HasValue + /// guard is gating only absent values, not explicit ones. + /// + [TestMethod] + public async Task RefreshDatabaseAccount_DisableCrossRegionalHedging_ExplicitFalseAfterTrue_FiresChangeEvent() + { + AccountProperties initialAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + }, + DisableCrossRegionalHedging = true + }; + + AccountProperties refreshAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + }, + DisableCrossRegionalHedging = false + }; + + Mock mockOwner = new Mock(); + mockOwner.Setup(owner => owner.ServiceEndpoint).Returns(new Uri("https://defaultendpoint.net/")); + mockOwner.Setup(owner => owner.GetDatabaseAccountInternalAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(refreshAccount); + + ConnectionPolicy connectionPolicy = new ConnectionPolicy(); + + using GlobalEndpointManager gem = new GlobalEndpointManager(mockOwner.Object, connectionPolicy); + gem.InitializeAccountPropertiesAndStartBackgroundRefresh(initialAccount); + + int changeEventInvocations = 0; + bool? observedDisableValue = null; + gem.OnEnablePartitionLevelFailoverConfigChanged += (_, disableHedging) => + { + Interlocked.Increment(ref changeEventInvocations); + observedDisableValue = disableHedging; + }; + + await gem.RefreshLocationAsync(forceRefresh: true); + + Assert.AreEqual( + 1, + changeEventInvocations, + "Explicit transition from disableCrossRegionalHedging=true to false must fire the change event exactly once."); + Assert.IsTrue(observedDisableValue.HasValue && observedDisableValue.Value == false, + "Change event must propagate the new explicit false value."); + } + private sealed class GetAccountRequestInjector { public Func ShouldFailRequest { get; set; } diff --git a/changelog.md b/changelog.md index 9761217a21..6c16868a2c 100644 --- a/changelog.md +++ b/changelog.md @@ -19,6 +19,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 #### Features Added +- [5829](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5829) Routing: Adds Gateway Account Property Flag for Dynamic Hedging Control. Introduces a Gateway-controlled `disableCrossRegionalHedging` account property that lets operators dynamically disable PPAF cross-region hedging (and any customer-configured `AvailabilityStrategy`) without rolling back PPAF entirely. The SDK reconciles `ConnectionPolicy.AvailabilityStrategy` against the Gateway-supplied flag on each account-properties refresh; toggling the flag back to `false` restores the customer-configured strategy or rebuilds the SDK default. + #### Breaking Changes #### Bugs Fixed diff --git a/openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml b/openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml new file mode 100644 index 0000000000..2ca4bc8517 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-24 diff --git a/openspec/changes/ppaf-dynamic-hedging-control/design.md b/openspec/changes/ppaf-dynamic-hedging-control/design.md new file mode 100644 index 0000000000..deeedbb412 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/design.md @@ -0,0 +1,87 @@ +## Context + +PPAF (Partition-Level Failover) is an account-level feature that, when enabled, triggers automatic hedging of read requests with a default threshold of `Min(1000ms, RequestTimeout/2)` and a step of 500ms. The hedging is implemented via `CrossRegionHedgingAvailabilityStrategy`, which is either explicitly configured by the customer on `CosmosClientOptions.AvailabilityStrategy` or automatically created by the SDK in `DocumentClient.InitializePartitionLevelFailoverWithDefaultHedging()`. + +**Current hedging decision flow:** + +1. `RequestInvokerHandler` resolves the active `AvailabilityStrategy`: request-level override → client-level → null. +2. If a strategy is present and `Enabled()`, requests are dispatched through `CrossRegionHedgingAvailabilityStrategy.ExecuteAvailabilityStrategyAsync()`. +3. The strategy decides per-request whether to hedge based on resource type (Document only), operation type (reads always, writes only if multi-write enabled), and available regions. + +**Account properties refresh:** + +- `GlobalEndpointManager.RefreshDatabaseAccountInternalAsync()` periodically fetches `AccountProperties` from the Gateway. +- The `OnEnablePartitionLevelFailoverConfigChanged` event fires when PPAF status changes, triggering `DocumentClient.UpdatePartitionLevelFailoverConfigWithAccountRefresh()` to dynamically enable or disable default hedging. + +**Problem:** There is no mechanism for on-call engineers to temporarily disable hedging for a PPAF account without rolling back PPAF entirely. Rolling back PPAF is expensive and disrupts other benefits. A Gateway-controlled flag is needed as a targeted escape hatch. + +## Goals / Non-Goals + +**Goals:** + +- Add a new boolean account property (`disableCrossRegionalHedging`) read from Gateway responses. +- When the flag is `true`, disable all hedging (both SDK-default PPAF hedging and explicit customer-configured hedging) for that account. +- When the flag is `false` or absent, preserve existing hedging behavior. +- Support dynamic toggling: as clients observe refreshed account properties, hedging state updates accordingly. +- Keep the feature entirely internal — no new public API surface. + +**Non-Goals:** + +- Changing customer-authored hedging strategies or their configuration shape. +- Modifying PPAF enablement or onboarding flows. +- Exposing the flag to end users or making it configurable from the SDK. + +## Decisions + +### 1. Property location: `AccountProperties` with `JsonExtensionData` fallback + +**Decision:** Add a strongly-typed `bool?` property `DisableCrossRegionalHedging` to `AccountProperties` with a `[JsonProperty]` attribute mapped to the Gateway JSON key `"disableCrossRegionalHedging"`. + +**Rationale:** This is consistent with how `EnablePartitionLevelFailover` is already modeled (Line 249 of `AccountProperties.cs`). A strongly-typed property provides compile-time safety and discoverability. The `[JsonExtensionData] AdditionalProperties` dictionary exists as a fallback for unknown fields, but relying on it would lose type safety and require manual parsing. + +**Alternative considered:** Reading from `AdditionalProperties` at evaluation time. Rejected because it introduces fragile string-keyed lookups and inconsistency with the existing pattern for account-level flags. + +### 2. Evaluation point: `GlobalEndpointManager` account-refresh callback + +**Decision:** Evaluate the `disableCrossRegionalHedging` flag in `DocumentClient.UpdatePartitionLevelFailoverConfigWithAccountRefresh()` — the same method that already handles dynamic PPAF enable/disable based on account-property changes. + +**Rationale:** This method is invoked whenever `GlobalEndpointManager` detects a change in PPAF-related account properties. Adding the hedging-disable check here ensures the flag is evaluated on every account-properties refresh, supporting dynamic toggling. It also consolidates all PPAF-related hedging logic in one place. + +**Alternative considered:** Evaluating the flag per-request in `RequestInvokerHandler`. Rejected because it would require propagating account properties into the hot path and adds unnecessary per-request overhead. The account-refresh callback runs infrequently and already handles strategy assignment. + +### 3. Enforcement mechanism: Strategy nullification / replacement + +**Decision:** When the flag is `true`, set `ConnectionPolicy.AvailabilityStrategy` to `null` (or a `DisabledAvailabilityStrategy` sentinel) to disable hedging. When the flag is toggled back to `false`, re-evaluate and restore the appropriate strategy (explicit customer config or PPAF default). + +**Rationale:** `RequestInvokerHandler` already treats a null strategy as "no hedging" (Line 97-98: `strategy != null && strategy.Enabled()`). Using the existing null-check path avoids introducing new conditional logic in the request hot path. The `DisabledAvailabilityStrategy` subclass already exists for explicit opt-out scenarios, though null assignment is simpler. + +**Alternative considered:** Adding an `IsDisabledByGateway` flag to `CrossRegionHedgingAvailabilityStrategy` and checking it in `Enabled()`. Rejected because it would couple the strategy to Gateway concerns and requires changes to the strategy hierarchy. + +### 4. Precedence: Gateway flag overrides all hedging configuration + +**Decision:** When the Gateway flag is `true`, it overrides both SDK-default PPAF hedging AND any explicit customer-configured `AvailabilityStrategy`. The precedence order becomes: + +1. Gateway `disableCrossRegionalHedging = true` → hedging OFF (highest priority) +2. Request-level `AvailabilityStrategy` override +3. Client-level `AvailabilityStrategy` (explicit customer config) +4. PPAF default hedging (if PPAF enabled, no explicit config) + +**Rationale:** The flag is an operational safety valve. If on-call determines hedging is causing issues, it must be a hard kill switch regardless of what the customer configured. This prevents scenarios where explicit customer hedging bypasses the mitigation. + +### 5. State tracking: Store flag and original strategy reference + +**Decision:** Introduce internal fields in `DocumentClient`: +- `bool disableCrossRegionalHedgingFlag` — cached value of the Gateway flag. +- `AvailabilityStrategy customerConfiguredStrategy` — the customer's original explicit strategy (if any), stored before nullification so it can be restored when the flag is toggled back. + +**Rationale:** The SDK must be able to restore the correct hedging behavior when the flag is turned off. Without storing the original strategy, the SDK cannot distinguish between "customer never set a strategy" and "strategy was removed by the flag." + +## Risks / Trade-offs + +- **[Risk] Flag latency** — The flag takes effect only after the next account-properties refresh (default interval is ~5 minutes). → **Mitigation:** On-call can force a faster refresh by restarting the client or waiting for the next refresh cycle. The existing `GlobalEndpointManager` refresh interval is sufficient for non-emergency toggles. + +- **[Risk] Customer confusion if explicit hedging silently disabled** — If a customer configured explicit hedging and on-call disables it via the flag, the customer may observe unexpected behavior. → **Mitigation:** The flag is intended as a temporary operational tool. On-call should communicate with the customer. SDK diagnostics/traces should log when hedging is disabled by the Gateway flag. + +- **[Risk] Strategy restoration correctness** — When restoring hedging after the flag is toggled off, the SDK must correctly reconstruct the PPAF default strategy or restore the customer's explicit strategy. → **Mitigation:** Store the original strategy reference before nullification. Unit-test the toggle cycle (enable → disable → re-enable). + +- **[Trade-off] Customer client-level opt-out bypasses the flag** — Customers who set `CosmosClientOptions.DisablePartitionLevelFailover` (surfaced internally as `DisablePartitionLevelFailoverClientLevelOverride`) opt out of all PPAF-related machinery including this Gateway override. This is consistent with the rest of the PPAF control plane and avoids surprising customers who have explicitly disabled the feature. diff --git a/openspec/changes/ppaf-dynamic-hedging-control/proposal.md b/openspec/changes/ppaf-dynamic-hedging-control/proposal.md new file mode 100644 index 0000000000..f48f1481b7 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/proposal.md @@ -0,0 +1,35 @@ +## Why + +PPAF-enabled Cosmos DB accounts automatically enable hedging with a 1-second threshold to fast-track read-region failover. However, production incidents have shown that implicit hedging of long-running queries can cause unexpected exceptions (e.g., ArgumentException in CallStore). Rolling back PPAF entirely to disable hedging is operationally expensive and disrupts other PPAF benefits, so a targeted, service-side escape hatch is needed to let on-call engineers dynamically disable hedging without customer intervention. + +## What Changes + +- Introduce a new Gateway account property (`disableCrossRegionalHedging`) that the SDK reads from account-property responses. +- When the flag is `true`, hedging is disabled for the PPAF account regardless of any explicit or implicit hedging configuration. +- When the flag is `false` or absent, existing hedging behavior is preserved (explicit customer config honored; PPAF defaults applied if no explicit config). +- The SDK evaluates the flag dynamically on every account-properties refresh, enabling on-call toggle without customer code changes. +- The flag is honored for any account where the customer has not opted out via `CosmosClientOptions.DisablePartitionLevelFailover` (surfaced internally as `DisablePartitionLevelFailoverClientLevelOverride`). + +## Capabilities + +### New Capabilities +- `gateway-hedging-override`: Reads a new Gateway account property flag and enforces it as the highest-precedence control over PPAF hedging behavior, supporting dynamic enable/disable at the SDK layer. + +### Modified Capabilities + + +## Impact + +- **SDK Client layer** (`DocumentClient` / `CosmosClient` internals): hedging-decision logic must incorporate a new precedence check against the Gateway flag before evaluating explicit or default hedging configuration. +- **Account properties model**: new property deserialized from the Gateway response (`AccountProperties` or equivalent DTO). +- **Gateway / service dependency**: the flag is surfaced by the Cosmos DB Gateway; the SDK consumes it read-only. +- **No public API surface changes**: the feature is invisible to end users; no new `CosmosClientOptions` or request-options properties are exposed. +- **Testing**: unit tests for precedence rules; integration tests validating dynamic toggle via mocked account-property responses. + +## Cross-SDK Parity + +- **.NET (this PR).** Implements the gateway-driven `disableCrossRegionalHedging` flag in the `Microsoft.Azure.Cosmos` v3 SDK. +- **Java.** As of the date of this PR, `azure-sdk-for-java` does not surface this flag. Java already has the GEM `perPartitionAutomaticFailoverConfigModifier` callback plumbing (`GlobalEndpointManager.java`), so mirroring this knob is additive rather than new infrastructure. A tracking issue should be filed against `Azure/azure-sdk-for-java` so an operator flipping the gateway flag gets consistent behavior across .NET and Java clients on the same account. +- **Python.** Out of scope. Python's `azure-cosmos` SDK has no SDK-default PPAF hedging today (default `availability_strategy=False`), so the per-request override path that motivates this knob does not exist. + diff --git a/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md b/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md new file mode 100644 index 0000000000..059f063d7d --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md @@ -0,0 +1,142 @@ +## ADDED Requirements + +### Requirement: Gateway account property for hedging control +The `AccountProperties` model SHALL include a nullable boolean property `DisableCrossRegionalHedging` deserialized from the Gateway JSON key `"disableCrossRegionalHedging"`. The property SHALL default to `null` when absent from the Gateway response. + +#### Scenario: Gateway response includes the flag set to true +- **WHEN** the Gateway account-properties response contains `"disableCrossRegionalHedging": true` +- **THEN** the `AccountProperties.DisableCrossRegionalHedging` property SHALL be `true` + +#### Scenario: Gateway response includes the flag set to false +- **WHEN** the Gateway account-properties response contains `"disableCrossRegionalHedging": false` +- **THEN** the `AccountProperties.DisableCrossRegionalHedging` property SHALL be `false` + +#### Scenario: Gateway response does not include the flag +- **WHEN** the Gateway account-properties response does not contain the `"disableCrossRegionalHedging"` key +- **THEN** the `AccountProperties.DisableCrossRegionalHedging` property SHALL be `null` + +--- + +### Requirement: Gateway flag disables all hedging when true +When the Gateway flag `disableCrossRegionalHedging` is `true`, the SDK SHALL disable all hedging for PPAF-enabled accounts regardless of any explicit or implicit hedging configuration. + +#### Scenario: PPAF account with default hedging and flag set to true +- **WHEN** the account has PPAF enabled with SDK-default hedging active +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL disable hedging +- **AND** requests SHALL NOT be hedged across regions + +#### Scenario: PPAF account with explicit customer hedging and flag set to true +- **WHEN** the account has PPAF enabled +- **AND** the customer has configured an explicit `AvailabilityStrategy` via `CosmosClientOptions` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL disable hedging +- **AND** the explicit customer strategy SHALL NOT be executed + +#### Scenario: PPAF account with request-level hedging override and flag set to true +- **WHEN** the account has PPAF enabled +- **AND** a request has a per-request `AvailabilityStrategy` override set in `RequestOptions` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL disable hedging for that request +- **AND** the request-level strategy SHALL NOT be executed + +--- + +### Requirement: Existing behavior preserved when flag is false or absent +When the Gateway flag `disableCrossRegionalHedging` is `false` or absent from the account-properties response, the SDK SHALL preserve existing hedging behavior without any change. + +#### Scenario: PPAF account with flag set to false and no explicit hedging +- **WHEN** the account has PPAF enabled +- **AND** the Gateway flag `disableCrossRegionalHedging` is `false` +- **AND** no explicit customer hedging configuration is set +- **THEN** the SDK SHALL enable the default PPAF hedging strategy with threshold `Min(1000ms, RequestTimeout/2)` and step `500ms` + +#### Scenario: PPAF account with flag absent and explicit hedging configured +- **WHEN** the account has PPAF enabled +- **AND** the Gateway flag `disableCrossRegionalHedging` is absent from the response +- **AND** the customer has configured an explicit `AvailabilityStrategy` +- **THEN** the SDK SHALL honor the customer's explicit hedging configuration + +#### Scenario: PPAF account with flag set to false and explicit hedging configured +- **WHEN** the account has PPAF enabled +- **AND** the Gateway flag `disableCrossRegionalHedging` is `false` +- **AND** the customer has configured an explicit `AvailabilityStrategy` +- **THEN** the SDK SHALL honor the customer's explicit hedging configuration + +--- + +### Requirement: Dynamic toggling via account-properties refresh +The SDK SHALL evaluate the Gateway flag on each account-properties refresh cycle and dynamically enable or disable hedging as the flag value changes, without requiring client restart. + +#### Scenario: Flag toggled from false to true during runtime +- **WHEN** the Gateway flag `disableCrossRegionalHedging` was `false` (or absent) at client initialization +- **AND** hedging was active (default or explicit) +- **AND** the Gateway flag is changed to `true` +- **AND** the SDK observes the updated account properties via the next refresh cycle +- **THEN** the SDK SHALL disable hedging + +#### Scenario: Flag toggled from true to false during runtime +- **WHEN** the Gateway flag `disableCrossRegionalHedging` was `true` and hedging was disabled +- **AND** the Gateway flag is changed to `false` +- **AND** the SDK observes the updated account properties via the next refresh cycle +- **THEN** the SDK SHALL re-enable hedging using the appropriate strategy +- **AND** if the customer had configured an explicit strategy, that strategy SHALL be restored +- **AND** if no explicit strategy was configured, the SDK-default PPAF hedging strategy SHALL be applied + +#### Scenario: Flag toggled from true to false with no prior explicit strategy +- **WHEN** the Gateway flag `disableCrossRegionalHedging` transitions from `true` to `false` +- **AND** the customer did not configure an explicit `AvailabilityStrategy` +- **AND** the account has PPAF enabled +- **THEN** the SDK SHALL re-enable the default PPAF hedging strategy + +--- + +### Requirement: Customer client-level opt-out bypasses the flag +When a customer has set `CosmosClientOptions.DisablePartitionLevelFailover` (surfaced internally as `DisablePartitionLevelFailoverClientLevelOverride`), the SDK SHALL NOT evaluate or act on the `disableCrossRegionalHedging` flag. The customer's local configuration takes precedence over the Gateway-driven operator override. Outside that customer opt-out the flag is honored for both PPAF and non-PPAF accounts. + +#### Scenario: Customer opt-out with flag set to true +- **WHEN** the customer has set `CosmosClientOptions.DisablePartitionLevelFailover = true` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL ignore the Gateway flag +- **AND** any explicit customer hedging configuration SHALL continue to function normally + +#### Scenario: Non-PPAF account without opt-out with flag set to true +- **WHEN** the account does NOT have PPAF enabled (`EnablePartitionLevelFailover` is `false` or absent) +- **AND** the customer has NOT set `CosmosClientOptions.DisablePartitionLevelFailover` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL honor the Gateway flag +- **AND** SHALL disable hedging if the customer had configured an explicit `AvailabilityStrategy` + +--- + +### Requirement: Feature is invisible to end users +The Gateway hedging override flag SHALL NOT be exposed through any public SDK API surface. There SHALL be no new public properties on `CosmosClientOptions`, `RequestOptions`, or any other user-facing type related to this flag. + +#### Scenario: No public API surface for the flag +- **WHEN** a developer inspects the public API of `CosmosClientOptions`, `ItemRequestOptions`, `QueryRequestOptions`, or `ChangeFeedRequestOptions` +- **THEN** there SHALL be no property or method related to `disableCrossRegionalHedging` + +#### Scenario: Diagnostics logging when hedging is disabled by flag +- **WHEN** hedging is disabled due to the Gateway flag being `true` +- **THEN** the SDK SHALL include a trace or diagnostic entry indicating that hedging was disabled by a Gateway account property +- **AND** this information SHALL be available in SDK diagnostics for supportability + +--- + +### Requirement: Precedence rules for hedging evaluation +The SDK SHALL evaluate hedging configuration using the following strict precedence order: +1. Gateway `disableCrossRegionalHedging = true` → hedging OFF (highest priority) +2. If Gateway flag is `false` or absent → evaluate existing rules (request-level override → client-level strategy → PPAF default) + +#### Scenario: Gateway flag true takes precedence over all other configuration +- **WHEN** the Gateway flag `disableCrossRegionalHedging` is `true` +- **AND** the customer has configured an explicit `AvailabilityStrategy` at the client level +- **AND** a request has a per-request `AvailabilityStrategy` override +- **THEN** the SDK SHALL disable hedging for that request +- **AND** neither the client-level nor request-level strategy SHALL be executed + +#### Scenario: Gateway flag false defers to existing precedence +- **WHEN** the Gateway flag `disableCrossRegionalHedging` is `false` +- **AND** the customer has configured an explicit `AvailabilityStrategy` at the client level +- **AND** a request has a per-request `AvailabilityStrategy` override +- **THEN** the request-level strategy SHALL be used (existing precedence preserved) diff --git a/openspec/changes/ppaf-dynamic-hedging-control/tasks.md b/openspec/changes/ppaf-dynamic-hedging-control/tasks.md new file mode 100644 index 0000000000..d7649ee873 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/tasks.md @@ -0,0 +1,38 @@ +## 1. Account Properties Model + +- [ ] 1.1 Add `DisableCrossRegionalHedging` nullable bool property to `AccountProperties.cs` with `[JsonProperty("disableCrossRegionalHedging")]` attribute, following the same pattern as `EnablePartitionLevelFailover` +- [ ] 1.2 Add unit tests for `AccountProperties` deserialization: flag present as `true`, flag present as `false`, and flag absent from JSON + +## 2. DocumentClient State Tracking + +- [ ] 2.1 Add internal field `disableCrossRegionalHedgingFlag` (bool, default false) to `DocumentClient` to cache the current Gateway flag value +- [ ] 2.2 Add internal field to store the customer's original explicit `AvailabilityStrategy` reference (if any) so it can be restored when the flag is toggled back to false + +## 3. Hedging Evaluation in Account-Refresh Callback + +- [ ] 3.1 Modify `GlobalEndpointManager.RefreshDatabaseAccountInternalAsync()` (or the existing PPAF-change callback) to propagate the `DisableCrossRegionalHedging` value to `DocumentClient` +- [ ] 3.2 Modify `DocumentClient.UpdatePartitionLevelFailoverConfigWithAccountRefresh()` to evaluate the Gateway flag: when `true`, store the current strategy and set `ConnectionPolicy.AvailabilityStrategy` to null/disabled; when `false` or absent, restore the appropriate strategy +- [ ] 3.3 Ensure the flag is also evaluated during initial client setup in `DocumentClient.InitializePartitionLevelFailoverWithDefaultHedging()` — if the flag is `true` at initialization time, do not enable default hedging + +## 4. Request-Level Enforcement + +- [x] 4.1 Update `RequestInvokerHandler` hedging resolution logic to check the Gateway disable flag before evaluating request-level or client-level `AvailabilityStrategy` overrides, ensuring the flag takes absolute precedence when `true` + +## 5. Diagnostics and Tracing + +- [ ] 5.1 Add a trace/diagnostic log entry when hedging is disabled due to the Gateway flag, including the flag value and the action taken (e.g., "Hedging disabled by Gateway account property disableCrossRegionalHedging=true") +- [ ] 5.2 Add a trace/diagnostic log entry when hedging is re-enabled after the flag is toggled back to false + +## 6. Unit Tests + +- [ ] 6.1 Test: PPAF account with default hedging — flag `true` disables hedging, flag toggled to `false` re-enables default hedging +- [ ] 6.2 Test: PPAF account with explicit customer hedging — flag `true` disables hedging, flag toggled to `false` restores customer strategy +- [x] 6.3 Test: PPAF account with request-level hedging override — flag `true` prevents request-level strategy execution +- [ ] 6.4 Test: Account with `DisablePartitionLevelFailoverClientLevelOverride=true` — flag `true` does not affect hedging (verifies customer client-level opt-out gate) +- [ ] 6.5 Test: Flag absent from account properties — existing behavior unchanged +- [ ] 6.6 Test: Dynamic toggle cycle — enable → disable → re-enable with correct strategy restoration + +## 7. Integration Verification + +- [ ] 7.1 Validate end-to-end with mocked Gateway responses containing the `disableCrossRegionalHedging` flag in integration/emulator tests +- [ ] 7.2 Verify no public API surface changes — confirm `DisableCrossRegionalHedging` is internal only and not exposed on `CosmosClientOptions`, `RequestOptions`, or related types