From 5044b6f536ac8c6009dbf6f76ece51989287775b Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Tue, 5 May 2026 09:22:25 -0700 Subject: [PATCH 1/6] Routing: Adds Gateway Account Property Flag for Dynamic Hedging Control Adds a Gateway-controlled `disableCrossRegionalHedging` account property that lets on-call engineers dynamically disable PPAF cross-region hedging without rolling back PPAF entirely. When the flag is true, the SDK clears the AvailabilityStrategy on the next account-properties refresh; toggling it back to false restores the customer-configured strategy or the SDK default. Changes - AccountProperties: new internal `DisableCrossRegionalHedging` (nullable bool) bound to the JSON key `disableCrossRegionalHedging`. - GlobalEndpointManager: extends `OnEnablePartitionLevelFailoverConfigChanged` to `Action` and tracks `lastKnownDisableCrossRegionalHedging` so the event fires when either flag changes. - DocumentClient: - Captures the initial gateway flag during `GetInitializationTaskAsync` and stashes any customer-configured `AvailabilityStrategy` if the flag is true at startup. - `InitializePartitionLevelFailoverWithDefaultHedging` short-circuits when the flag is true. - `UpdatePartitionLevelFailoverConfigWithAccountRefresh` now reconciles both PPAF state and the disable flag via `ApplyHedgingStrategyForCurrentState`, which restores customer strategies when the flag flips back off and skips stashing SDK-default strategies (regenerated deterministically). - Tests: 10 new unit tests covering deserialization, init-time skip, customer- strategy stash/restore, SDK-default rebuild, and client-level override. - Updates 2 existing serializer contract tests that pin the `AccountProperties` JSON shape. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 + Microsoft.Azure.Cosmos/src/DocumentClient.cs | 148 +++++++++-- .../Resource/Settings/AccountProperties.cs | 14 ++ .../src/Routing/GlobalEndpointManager.cs | 60 +++-- .../CosmosJsonSerializerUnitTests.cs | 2 +- .../CosmosSerializerCoreTests.cs | 2 +- .../GatewayHedgingOverrideTests.cs | 233 ++++++++++++++++++ .../.openspec.yaml | 2 + .../ppaf-dynamic-hedging-control/design.md | 88 +++++++ .../ppaf-dynamic-hedging-control/proposal.md | 28 +++ .../specs/gateway-hedging-override/spec.md | 141 +++++++++++ .../ppaf-dynamic-hedging-control/tasks.md | 38 +++ 12 files changed, 720 insertions(+), 39 deletions(-) create mode 100644 Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs create mode 100644 openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml create mode 100644 openspec/changes/ppaf-dynamic-hedging-control/design.md create mode 100644 openspec/changes/ppaf-dynamic-hedging-control/proposal.md create mode 100644 openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md create mode 100644 openspec/changes/ppaf-dynamic-hedging-control/tasks.md diff --git a/.gitignore b/.gitignore index a02d350b97..faa8cc41a9 100644 --- a/.gitignore +++ b/.gitignore @@ -331,3 +331,6 @@ ASALocalRun/ # Visual Studio Code files .vscode/ + +.coding-harness/ + diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index c70b0b8668..9fd3ddbd31 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -174,6 +174,15 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider private bool isDisposing; private bool isDisposed; + // Gateway-controlled override that disables all cross-regional hedging for PPAF accounts. + // Mirrors AccountProperties.DisableCrossRegionalHedging from the most recent account-properties refresh. + private bool disableCrossRegionalHedging; + + // When the gateway disable flag is true, the customer's explicit AvailabilityStrategy + // (if any) is stashed here so it can be restored verbatim if the flag is later toggled back to false. + // Null when the customer never configured a strategy or when no stash is currently held. + private AvailabilityStrategy customerConfiguredAvailabilityStrategy; + // creator of TransportClient is responsible for disposing it. private IStoreClientFactory storeClientFactory; internal CosmosHttpClient httpClient { get; private set; } @@ -1063,6 +1072,21 @@ private async Task GetInitializationTaskAsync(IStoreClientFactory storeCli this.ConnectionPolicy.EnablePartitionLevelFailover = this.accountServiceConfiguration.AccountProperties.EnablePartitionLevelFailover.Value; } + // Capture the initial gateway disableCrossRegionalHedging flag and stash any customer-configured + // AvailabilityStrategy so it can be restored if the flag is later toggled back to false. + if (!this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride + && this.accountServiceConfiguration?.AccountProperties != null) + { + this.disableCrossRegionalHedging = this.accountServiceConfiguration.AccountProperties.DisableCrossRegionalHedging ?? false; + if (this.disableCrossRegionalHedging && this.ConnectionPolicy.AvailabilityStrategy != null) + { + this.customerConfiguredAvailabilityStrategy = this.ConnectionPolicy.AvailabilityStrategy; + this.ConnectionPolicy.AvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging disabled at initialization by Gateway property disableCrossRegionalHedging=true"); + } + } + this.isThinClientEnabled = this.isThinClientFeatureFlagEnabled && (this.ConnectionPolicy.ConnectionMode == ConnectionMode.Gateway) && (this.accountServiceConfiguration.AccountProperties?.ThinClientWritableLocationsInternal?.Count ?? 0) > 0; this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker |= this.ConnectionPolicy.EnablePartitionLevelFailover; @@ -6900,6 +6924,13 @@ internal string GetUserAgentFeatures() internal void InitializePartitionLevelFailoverWithDefaultHedging() { + if (this.disableCrossRegionalHedging) + { + DefaultTrace.TraceInformation( + "DocumentClient: Skipping default PPAF hedging because Gateway property disableCrossRegionalHedging=true"); + return; + } + if (this.ConnectionPolicy.EnablePartitionLevelFailover && this.ConnectionPolicy.AvailabilityStrategy == null) { @@ -6926,8 +6957,9 @@ internal void InitializePartitionLevelFailoverWithDefaultHedging() } } - private void UpdatePartitionLevelFailoverConfigWithAccountRefresh( - bool isEnabled) + internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( + bool isEnabled, + bool disableCrossRegionalHedging) { // Only update if client-level override is not disabled if (this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride) @@ -6936,36 +6968,112 @@ private void UpdatePartitionLevelFailoverConfigWithAccountRefresh( return; } - DefaultTrace.TraceInformation( - "DocumentClient: PPAF Account Level Config Updated. Updating EnablePartitionLevelFailover to {0}", - isEnabled); + bool ppafEnablementChanged = this.ConnectionPolicy.EnablePartitionLevelFailover != isEnabled; + bool hedgingFlagChanged = this.disableCrossRegionalHedging != disableCrossRegionalHedging; - // Step 1: Enable partition level failover. - this.PartitionKeyRangeLocation.SetIsPPAFEnabled(isEnabled); - this.ConnectionPolicy.EnablePartitionLevelFailover = isEnabled; + if (ppafEnablementChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: PPAF Account Level Config Updated. Updating EnablePartitionLevelFailover to {0}", + isEnabled); + + // Step 1: Enable partition level failover. + this.PartitionKeyRangeLocation.SetIsPPAFEnabled(isEnabled); + this.ConnectionPolicy.EnablePartitionLevelFailover = isEnabled; - // Step 2: Enable partition level circuit breaker. - this.PartitionKeyRangeLocation.SetIsPPCBEnabled(isEnabled); - this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker = isEnabled; + // Step 2: Enable partition level circuit breaker. + this.PartitionKeyRangeLocation.SetIsPPCBEnabled(isEnabled); + this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker = isEnabled; + } - // Step 3: Enable default hedging strategy if partition level failover is enabled. - if (isEnabled && this.ConnectionPolicy.AvailabilityStrategy == null) + if (hedgingFlagChanged) { - this.InitializePartitionLevelFailoverWithDefaultHedging(); + DefaultTrace.TraceInformation( + "DocumentClient: Gateway disableCrossRegionalHedging flag changed to {0}", + disableCrossRegionalHedging); + this.disableCrossRegionalHedging = disableCrossRegionalHedging; } - else + + // Step 3: Reconcile the AvailabilityStrategy with the latest account state. The gateway + // disable flag has the highest precedence — when true, hedging is OFF regardless of any + // explicit or default configuration. When false, restore the customer's explicit strategy + // (if previously stashed) or apply the SDK default for PPAF. + this.ApplyHedgingStrategyForCurrentState(); + + if (ppafEnablementChanged) + { + // Step 4: Update the user agent features. Hedging-flag-only changes do not affect the + // PPAF-related user-agent feature flags. + this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); + } + + DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + } + + /// + /// Reconciles with the current values of + /// and . + /// + /// + /// Precedence (highest first): + /// 1. Gateway disableCrossRegionalHedging = true: stash any non-default strategy and clear it. + /// 2. Customer explicitly configured a strategy: keep / restore that strategy. + /// 3. PPAF enabled with no explicit strategy: apply SDK default PPAF hedging. + /// 4. PPAF disabled: clear any SDK-default strategy that we previously installed. + /// + private void ApplyHedgingStrategyForCurrentState() + { + if (this.disableCrossRegionalHedging) { - if (this.ConnectionPolicy.AvailabilityStrategy is CrossRegionHedgingAvailabilityStrategy strategy && strategy.IsSDKDefaultStrategyForPPAF) + AvailabilityStrategy currentStrategy = this.ConnectionPolicy.AvailabilityStrategy; + if (currentStrategy != null) { - // If the user has not set a custom availability strategy, then we will reset it to null. + bool currentIsSdkDefault = currentStrategy is CrossRegionHedgingAvailabilityStrategy hedging + && hedging.IsSDKDefaultStrategyForPPAF; + + // Only stash customer-configured strategies. The SDK default can be regenerated + // deterministically from PPAF state, so re-stashing it would only cause confusion + // when reconciling later. + if (!currentIsSdkDefault && this.customerConfiguredAvailabilityStrategy == null) + { + this.customerConfiguredAvailabilityStrategy = currentStrategy; + } + this.ConnectionPolicy.AvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging disabled by Gateway property disableCrossRegionalHedging=true"); } + return; } - // Step 4: Update the user agent features. - this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); + // disableCrossRegionalHedging == false: restore or rebuild the appropriate strategy. + if (this.customerConfiguredAvailabilityStrategy != null) + { + this.ConnectionPolicy.AvailabilityStrategy = this.customerConfiguredAvailabilityStrategy; + this.customerConfiguredAvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging re-enabled — restored customer-configured AvailabilityStrategy"); + return; + } - DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + if (this.ConnectionPolicy.EnablePartitionLevelFailover && this.ConnectionPolicy.AvailabilityStrategy == null) + { + this.InitializePartitionLevelFailoverWithDefaultHedging(); + if (this.ConnectionPolicy.AvailabilityStrategy != null) + { + DefaultTrace.TraceInformation( + "DocumentClient: Hedging re-enabled — applied SDK default PPAF hedging strategy"); + } + return; + } + + if (!this.ConnectionPolicy.EnablePartitionLevelFailover + && this.ConnectionPolicy.AvailabilityStrategy is CrossRegionHedgingAvailabilityStrategy sdkDefault + && sdkDefault.IsSDKDefaultStrategyForPPAF) + { + // PPAF disabled — drop the SDK default we previously installed. + this.ConnectionPolicy.AvailabilityStrategy = null; + } } internal void CaptureSessionToken(DocumentServiceRequest request, DocumentServiceResponse response) diff --git a/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs b/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs index d0c80c6700..02040c34ba 100644 --- a/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs +++ b/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs @@ -248,6 +248,20 @@ internal long ProvisionedDocumentStorageInMB [JsonProperty(PropertyName = Constants.Properties.EnablePerPartitionFailoverBehavior)] internal bool? EnablePartitionLevelFailover { get; set; } + /// + /// Gets the gateway-controlled override that disables cross-regional hedging for this account. + /// + /// + /// When this flag is , the SDK disables all hedging (both SDK-default PPAF + /// hedging and any explicit customer-configured ) regardless + /// of any other configuration. When the flag is or + /// (absent from the Gateway response), existing hedging behavior is preserved. The flag is only + /// honored for accounts with PPAF enabled and is intended as an operational escape hatch — it is + /// not exposed through any public SDK API surface. + /// + [JsonProperty(PropertyName = "disableCrossRegionalHedging")] + internal bool? DisableCrossRegionalHedging { get; set; } + private IDictionary QueryStringToDictConverter() { if (!string.IsNullOrEmpty(this.QueryEngineConfigurationString)) diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs index 8cadc2f6e8..f228816d82 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs @@ -41,12 +41,23 @@ internal class GlobalEndpointManager : IGlobalEndpointManager private readonly object isAccountRefreshInProgressLock = new object(); private bool isAccountRefreshInProgress = false; private bool isBackgroundAccountRefreshActive = false; - private DateTime LastBackgroundRefreshUtc = DateTime.MinValue; - - /// - /// Event that is raised when PPAF (Per Partition Automatic Failover) enablement status changes - /// - internal event Action? OnEnablePartitionLevelFailoverConfigChanged; + private DateTime LastBackgroundRefreshUtc = DateTime.MinValue; + + // Last observed value of the account-level disableCrossRegionalHedging flag. + // Tracked separately so the change event fires when only this flag toggles. + private bool lastKnownDisableCrossRegionalHedging = false; + + /// + /// Event that is raised when PPAF (Per Partition Automatic Failover) enablement status changes + /// or when the gateway-controlled disableCrossRegionalHedging flag toggles. + /// + /// + /// First argument is the latest EnablePartitionLevelFailover value observed from the + /// Gateway (falls back to the existing connection-policy value when the property is absent). + /// Second argument is the latest disableCrossRegionalHedging value (false when absent + /// from the Gateway response). + /// + internal event Action? OnEnablePartitionLevelFailoverConfigChanged; public GlobalEndpointManager( IDocumentClientInternal owner, @@ -608,10 +619,14 @@ public virtual void InitializeAccountPropertiesAndStartBackgroundRefresh(Account return; } - if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride && databaseAccount.EnablePartitionLevelFailover.HasValue) - { - this.connectionPolicy.EnablePartitionLevelFailover = databaseAccount.EnablePartitionLevelFailover.Value; - } + if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride && databaseAccount.EnablePartitionLevelFailover.HasValue) + { + this.connectionPolicy.EnablePartitionLevelFailover = databaseAccount.EnablePartitionLevelFailover.Value; + } + + // Capture initial disableCrossRegionalHedging baseline so the change-event only fires on + // subsequent transitions, not on the first observation. + this.lastKnownDisableCrossRegionalHedging = databaseAccount.DisableCrossRegionalHedging ?? false; GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(databaseAccount); @@ -768,13 +783,24 @@ private async Task RefreshDatabaseAccountInternalAsync(bool forceRefresh) try { this.LastBackgroundRefreshUtc = DateTime.UtcNow; - AccountProperties accountProperties = await this.GetDatabaseAccountAsync(true); - - if (!this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride - && accountProperties.EnablePartitionLevelFailover.HasValue - && (this.connectionPolicy.EnablePartitionLevelFailover != accountProperties.EnablePartitionLevelFailover.Value)) - { - this.OnEnablePartitionLevelFailoverConfigChanged?.Invoke(accountProperties.EnablePartitionLevelFailover.Value); + AccountProperties accountProperties = await this.GetDatabaseAccountAsync(true); + + bool ignorePpafChanges = this.connectionPolicy.DisablePartitionLevelFailoverClientLevelOverride; + + bool ppafEnablementChanged = !ignorePpafChanges + && accountProperties.EnablePartitionLevelFailover.HasValue + && (this.connectionPolicy.EnablePartitionLevelFailover != accountProperties.EnablePartitionLevelFailover.Value); + + bool currentDisableHedgingFlag = accountProperties.DisableCrossRegionalHedging ?? false; + bool disableHedgingFlagChanged = !ignorePpafChanges + && (currentDisableHedgingFlag != this.lastKnownDisableCrossRegionalHedging); + + if (ppafEnablementChanged || disableHedgingFlagChanged) + { + bool latestPpafEnabled = accountProperties.EnablePartitionLevelFailover + ?? this.connectionPolicy.EnablePartitionLevelFailover; + this.lastKnownDisableCrossRegionalHedging = currentDisableHedgingFlag; + this.OnEnablePartitionLevelFailoverConfigChanged?.Invoke(latestPpafEnabled, currentDisableHedgingFlag); } GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(accountProperties); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs index 3b4b9c1887..3fb1733fb2 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosJsonSerializerUnitTests.cs @@ -67,7 +67,7 @@ public void ValidatePropertySerialization() string id = "testId"; this.TestProperty( id, - $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""enableNRegionSynchronousCommit"":false}}"); + $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""disableCrossRegionalHedging"":null,""enableNRegionSynchronousCommit"":false}}"); this.TestProperty( id, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs index 00f00156fc..f865894418 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosSerializerCoreTests.cs @@ -143,7 +143,7 @@ public void ValidateCustomSerializerNotUsedForInternalTypes() this.TestProperty( serializerCore, id, - $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""enableNRegionSynchronousCommit"":false}}"); + $@"{{""id"":""{id}"",""writableLocations"":[],""readableLocations"":[],""userConsistencyPolicy"":null,""addresses"":null,""userReplicationPolicy"":null,""systemReplicationPolicy"":null,""readPolicy"":null,""queryEngineConfiguration"":null,""enableMultipleWriteLocations"":false,""enablePerPartitionFailoverBehavior"":null,""disableCrossRegionalHedging"":null,""enableNRegionSynchronousCommit"":false}}"); this.TestProperty( serializerCore, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs new file mode 100644 index 0000000000..2b3ed56355 --- /dev/null +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs @@ -0,0 +1,233 @@ +namespace Microsoft.Azure.Cosmos.Tests +{ + using System; + using System.Reflection; + using Microsoft.Azure.Cosmos; + using Microsoft.VisualStudio.TestTools.UnitTesting; + using Newtonsoft.Json; + + /// + /// Unit tests for the Gateway-controlled disableCrossRegionalHedging flag on + /// and the resulting hedging strategy reconciliation in + /// . + /// + [TestClass] + public class GatewayHedgingOverrideTests + { + private const string AccountEndpoint = "https://localhost:8081/"; + private const string AccountKey = "C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw=="; + + private static readonly FieldInfo DisableHedgingField = + typeof(DocumentClient).GetField("disableCrossRegionalHedging", BindingFlags.Instance | BindingFlags.NonPublic); + + private static readonly FieldInfo CustomerStrategyField = + typeof(DocumentClient).GetField("customerConfiguredAvailabilityStrategy", BindingFlags.Instance | BindingFlags.NonPublic); + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_True_Deserializes() + { + string json = "{\"id\":\"test\",\"disableCrossRegionalHedging\":true}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsTrue(props.DisableCrossRegionalHedging.HasValue); + Assert.IsTrue(props.DisableCrossRegionalHedging.Value); + } + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_False_Deserializes() + { + string json = "{\"id\":\"test\",\"disableCrossRegionalHedging\":false}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsTrue(props.DisableCrossRegionalHedging.HasValue); + Assert.IsFalse(props.DisableCrossRegionalHedging.Value); + } + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_Absent_DefaultsToNull() + { + string json = "{\"id\":\"test\"}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsFalse(props.DisableCrossRegionalHedging.HasValue); + } + + [TestMethod] + public void AccountProperties_DisableCrossRegionalHedging_Unknown_FieldDoesNotBreakDeserialization() + { + string json = "{\"id\":\"test\",\"disableCrossRegionalHedging\":true,\"someFutureFlag\":\"foo\"}"; + AccountProperties props = JsonConvert.DeserializeObject(json); + Assert.IsTrue(props.DisableCrossRegionalHedging.GetValueOrDefault()); + } + + [TestMethod] + public void InitializePartitionLevelFailoverWithDefaultHedging_FlagTrue_SkipsApplyingDefaultStrategy() + { + DocumentClient client = CreateClient(new ConnectionPolicy { EnablePartitionLevelFailover = true }); + try + { + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Pre-condition: no strategy configured"); + + SetDisableHedgingFlag(client, true); + client.InitializePartitionLevelFailoverWithDefaultHedging(); + + Assert.IsNull( + client.ConnectionPolicy.AvailabilityStrategy, + "Default hedging must NOT be applied when disableCrossRegionalHedging=true"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void InitializePartitionLevelFailoverWithDefaultHedging_FlagFalse_AppliesDefaultStrategy() + { + DocumentClient client = CreateClient(new ConnectionPolicy { EnablePartitionLevelFailover = true }); + try + { + SetDisableHedgingFlag(client, false); + client.InitializePartitionLevelFailoverWithDefaultHedging(); + + Assert.IsNotNull( + client.ConnectionPolicy.AvailabilityStrategy, + "Default PPAF hedging must be applied when disableCrossRegionalHedging=false"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_FlagToggleOn_StashesCustomerStrategyAndClearsAvailabilityStrategy() + { + ConnectionPolicy policy = new ConnectionPolicy { EnablePartitionLevelFailover = true }; + CrossRegionHedgingAvailabilityStrategy customerStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + policy.AvailabilityStrategy = customerStrategy; + + DocumentClient client = CreateClient(policy); + try + { + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Strategy must be cleared when disable flag flips to true"); + Assert.AreSame( + customerStrategy, + GetCustomerStashedStrategy(client), + "Customer-configured strategy must be stashed for later restoration"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_FlagToggleOff_RestoresStashedCustomerStrategy() + { + ConnectionPolicy policy = new ConnectionPolicy { EnablePartitionLevelFailover = true }; + CrossRegionHedgingAvailabilityStrategy customerStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + policy.AvailabilityStrategy = customerStrategy; + + DocumentClient client = CreateClient(policy); + try + { + // Step 1: disable hedging — strategy is stashed. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy); + + // Step 2: re-enable hedging — strategy is restored. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: false); + + Assert.AreSame( + customerStrategy, + client.ConnectionPolicy.AvailabilityStrategy, + "Customer-configured strategy must be restored when the disable flag toggles back to false"); + Assert.IsNull(GetCustomerStashedStrategy(client), "Stash must be cleared after restoration"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_ClientLevelOverrideDisabled_FlagIsIgnored() + { + ConnectionPolicy policy = new ConnectionPolicy + { + EnablePartitionLevelFailover = true, + DisablePartitionLevelFailoverClientLevelOverride = true, + }; + CrossRegionHedgingAvailabilityStrategy customerStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + policy.AvailabilityStrategy = customerStrategy; + + DocumentClient client = CreateClient(policy); + try + { + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + + Assert.AreSame( + customerStrategy, + client.ConnectionPolicy.AvailabilityStrategy, + "When DisablePartitionLevelFailoverClientLevelOverride=true, the gateway flag must be ignored entirely"); + } + finally + { + client.Dispose(); + } + } + + [TestMethod] + public void UpdateConfig_FlagTrue_DoesNotStashSDKDefaultStrategy() + { + ConnectionPolicy policy = new ConnectionPolicy { EnablePartitionLevelFailover = true }; + DocumentClient client = CreateClient(policy); + try + { + // Apply SDK default first. + client.InitializePartitionLevelFailoverWithDefaultHedging(); + Assert.IsNotNull(client.ConnectionPolicy.AvailabilityStrategy); + Assert.IsTrue(((CrossRegionHedgingAvailabilityStrategy)client.ConnectionPolicy.AvailabilityStrategy).IsSDKDefaultStrategyForPPAF); + + // Now flip the flag — the SDK default should be cleared but NOT stashed. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy); + Assert.IsNull( + GetCustomerStashedStrategy(client), + "SDK-default strategy must NOT be stashed — it can be regenerated deterministically"); + + // Toggle back off — SDK default is rebuilt from PPAF state. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: false); + + Assert.IsNotNull(client.ConnectionPolicy.AvailabilityStrategy); + Assert.IsTrue(((CrossRegionHedgingAvailabilityStrategy)client.ConnectionPolicy.AvailabilityStrategy).IsSDKDefaultStrategyForPPAF); + } + finally + { + client.Dispose(); + } + } + + private static DocumentClient CreateClient(ConnectionPolicy policy) + { + return new DocumentClient(new Uri(AccountEndpoint), AccountKey, policy); + } + + private static void SetDisableHedgingFlag(DocumentClient client, bool value) + { + DisableHedgingField.SetValue(client, value); + } + + private static AvailabilityStrategy GetCustomerStashedStrategy(DocumentClient client) + { + return (AvailabilityStrategy)CustomerStrategyField.GetValue(client); + } + } +} diff --git a/openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml b/openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml new file mode 100644 index 0000000000..2ca4bc8517 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-24 diff --git a/openspec/changes/ppaf-dynamic-hedging-control/design.md b/openspec/changes/ppaf-dynamic-hedging-control/design.md new file mode 100644 index 0000000000..885541a653 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/design.md @@ -0,0 +1,88 @@ +## Context + +PPAF (Partition-Level Failover) is an account-level feature that, when enabled, triggers automatic hedging of read requests with a default threshold of `Min(1000ms, RequestTimeout/2)` and a step of 500ms. The hedging is implemented via `CrossRegionHedgingAvailabilityStrategy`, which is either explicitly configured by the customer on `CosmosClientOptions.AvailabilityStrategy` or automatically created by the SDK in `DocumentClient.InitializePartitionLevelFailoverWithDefaultHedging()`. + +**Current hedging decision flow:** + +1. `RequestInvokerHandler` resolves the active `AvailabilityStrategy`: request-level override → client-level → null. +2. If a strategy is present and `Enabled()`, requests are dispatched through `CrossRegionHedgingAvailabilityStrategy.ExecuteAvailabilityStrategyAsync()`. +3. The strategy decides per-request whether to hedge based on resource type (Document only), operation type (reads always, writes only if multi-write enabled), and available regions. + +**Account properties refresh:** + +- `GlobalEndpointManager.RefreshDatabaseAccountInternalAsync()` periodically fetches `AccountProperties` from the Gateway. +- The `OnEnablePartitionLevelFailoverConfigChanged` event fires when PPAF status changes, triggering `DocumentClient.UpdatePartitionLevelFailoverConfigWithAccountRefresh()` to dynamically enable or disable default hedging. + +**Problem:** There is no mechanism for on-call engineers to temporarily disable hedging for a PPAF account without rolling back PPAF entirely. Rolling back PPAF is expensive and disrupts other benefits. A Gateway-controlled flag is needed as a targeted escape hatch. + +## Goals / Non-Goals + +**Goals:** + +- Add a new boolean account property (`disableCrossRegionalHedging`) read from Gateway responses. +- When the flag is `true`, disable all hedging (both SDK-default PPAF hedging and explicit customer-configured hedging) for that account. +- When the flag is `false` or absent, preserve existing hedging behavior. +- Support dynamic toggling: as clients observe refreshed account properties, hedging state updates accordingly. +- Keep the feature entirely internal — no new public API surface. + +**Non-Goals:** + +- Changing customer-authored hedging strategies or their configuration shape. +- Modifying PPAF enablement or onboarding flows. +- Supporting non-PPAF accounts with this flag (for the immediate term). +- Exposing the flag to end users or making it configurable from the SDK. + +## Decisions + +### 1. Property location: `AccountProperties` with `JsonExtensionData` fallback + +**Decision:** Add a strongly-typed `bool?` property `DisableCrossRegionalHedging` to `AccountProperties` with a `[JsonProperty]` attribute mapped to the Gateway JSON key `"disableCrossRegionalHedging"`. + +**Rationale:** This is consistent with how `EnablePartitionLevelFailover` is already modeled (Line 249 of `AccountProperties.cs`). A strongly-typed property provides compile-time safety and discoverability. The `[JsonExtensionData] AdditionalProperties` dictionary exists as a fallback for unknown fields, but relying on it would lose type safety and require manual parsing. + +**Alternative considered:** Reading from `AdditionalProperties` at evaluation time. Rejected because it introduces fragile string-keyed lookups and inconsistency with the existing pattern for account-level flags. + +### 2. Evaluation point: `GlobalEndpointManager` account-refresh callback + +**Decision:** Evaluate the `disableCrossRegionalHedging` flag in `DocumentClient.UpdatePartitionLevelFailoverConfigWithAccountRefresh()` — the same method that already handles dynamic PPAF enable/disable based on account-property changes. + +**Rationale:** This method is invoked whenever `GlobalEndpointManager` detects a change in PPAF-related account properties. Adding the hedging-disable check here ensures the flag is evaluated on every account-properties refresh, supporting dynamic toggling. It also consolidates all PPAF-related hedging logic in one place. + +**Alternative considered:** Evaluating the flag per-request in `RequestInvokerHandler`. Rejected because it would require propagating account properties into the hot path and adds unnecessary per-request overhead. The account-refresh callback runs infrequently and already handles strategy assignment. + +### 3. Enforcement mechanism: Strategy nullification / replacement + +**Decision:** When the flag is `true`, set `ConnectionPolicy.AvailabilityStrategy` to `null` (or a `DisabledAvailabilityStrategy` sentinel) to disable hedging. When the flag is toggled back to `false`, re-evaluate and restore the appropriate strategy (explicit customer config or PPAF default). + +**Rationale:** `RequestInvokerHandler` already treats a null strategy as "no hedging" (Line 97-98: `strategy != null && strategy.Enabled()`). Using the existing null-check path avoids introducing new conditional logic in the request hot path. The `DisabledAvailabilityStrategy` subclass already exists for explicit opt-out scenarios, though null assignment is simpler. + +**Alternative considered:** Adding an `IsDisabledByGateway` flag to `CrossRegionHedgingAvailabilityStrategy` and checking it in `Enabled()`. Rejected because it would couple the strategy to Gateway concerns and requires changes to the strategy hierarchy. + +### 4. Precedence: Gateway flag overrides all hedging configuration + +**Decision:** When the Gateway flag is `true`, it overrides both SDK-default PPAF hedging AND any explicit customer-configured `AvailabilityStrategy`. The precedence order becomes: + +1. Gateway `disableCrossRegionalHedging = true` → hedging OFF (highest priority) +2. Request-level `AvailabilityStrategy` override +3. Client-level `AvailabilityStrategy` (explicit customer config) +4. PPAF default hedging (if PPAF enabled, no explicit config) + +**Rationale:** The flag is an operational safety valve. If on-call determines hedging is causing issues, it must be a hard kill switch regardless of what the customer configured. This prevents scenarios where explicit customer hedging bypasses the mitigation. + +### 5. State tracking: Store flag and original strategy reference + +**Decision:** Introduce internal fields in `DocumentClient`: +- `bool disableCrossRegionalHedgingFlag` — cached value of the Gateway flag. +- `AvailabilityStrategy customerConfiguredStrategy` — the customer's original explicit strategy (if any), stored before nullification so it can be restored when the flag is toggled back. + +**Rationale:** The SDK must be able to restore the correct hedging behavior when the flag is turned off. Without storing the original strategy, the SDK cannot distinguish between "customer never set a strategy" and "strategy was removed by the flag." + +## Risks / Trade-offs + +- **[Risk] Flag latency** — The flag takes effect only after the next account-properties refresh (default interval is ~5 minutes). → **Mitigation:** On-call can force a faster refresh by restarting the client or waiting for the next refresh cycle. The existing `GlobalEndpointManager` refresh interval is sufficient for non-emergency toggles. + +- **[Risk] Customer confusion if explicit hedging silently disabled** — If a customer configured explicit hedging and on-call disables it via the flag, the customer may observe unexpected behavior. → **Mitigation:** The flag is intended as a temporary operational tool. On-call should communicate with the customer. SDK diagnostics/traces should log when hedging is disabled by the Gateway flag. + +- **[Risk] Strategy restoration correctness** — When restoring hedging after the flag is toggled off, the SDK must correctly reconstruct the PPAF default strategy or restore the customer's explicit strategy. → **Mitigation:** Store the original strategy reference before nullification. Unit-test the toggle cycle (enable → disable → re-enable). + +- **[Trade-off] Non-PPAF accounts ignored** — The flag is only evaluated for PPAF accounts. A future extension could support non-PPAF accounts, but this adds complexity without current demand. diff --git a/openspec/changes/ppaf-dynamic-hedging-control/proposal.md b/openspec/changes/ppaf-dynamic-hedging-control/proposal.md new file mode 100644 index 0000000000..7a92547c87 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/proposal.md @@ -0,0 +1,28 @@ +## Why + +PPAF-enabled Cosmos DB accounts automatically enable hedging with a 1-second threshold to fast-track read-region failover. However, production incidents have shown that implicit hedging of long-running queries can cause unexpected exceptions (e.g., ArgumentException in CallStore). Rolling back PPAF entirely to disable hedging is operationally expensive and disrupts other PPAF benefits, so a targeted, service-side escape hatch is needed to let on-call engineers dynamically disable hedging without customer intervention. + +## What Changes + +- Introduce a new Gateway account property (`disableCrossRegionalHedging`) that the SDK reads from account-property responses. +- When the flag is `true`, hedging is disabled for the PPAF account regardless of any explicit or implicit hedging configuration. +- When the flag is `false` or absent, existing hedging behavior is preserved (explicit customer config honored; PPAF defaults applied if no explicit config). +- The SDK evaluates the flag dynamically on every account-properties refresh, enabling on-call toggle without customer code changes. +- Non-PPAF accounts ignore the flag. + +## Capabilities + +### New Capabilities +- `gateway-hedging-override`: Reads a new Gateway account property flag and enforces it as the highest-precedence control over PPAF hedging behavior, supporting dynamic enable/disable at the SDK layer. + +### Modified Capabilities + + +## Impact + +- **SDK Client layer** (`DocumentClient` / `CosmosClient` internals): hedging-decision logic must incorporate a new precedence check against the Gateway flag before evaluating explicit or default hedging configuration. +- **Account properties model**: new property deserialized from the Gateway response (`AccountProperties` or equivalent DTO). +- **Gateway / service dependency**: the flag is surfaced by the Cosmos DB Gateway; the SDK consumes it read-only. +- **No public API surface changes**: the feature is invisible to end users; no new `CosmosClientOptions` or request-options properties are exposed. +- **Testing**: unit tests for precedence rules; integration tests validating dynamic toggle via mocked account-property responses. diff --git a/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md b/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md new file mode 100644 index 0000000000..ea102b011e --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md @@ -0,0 +1,141 @@ +## ADDED Requirements + +### Requirement: Gateway account property for hedging control +The `AccountProperties` model SHALL include a nullable boolean property `DisableCrossRegionalHedging` deserialized from the Gateway JSON key `"disableCrossRegionalHedging"`. The property SHALL default to `null` when absent from the Gateway response. + +#### Scenario: Gateway response includes the flag set to true +- **WHEN** the Gateway account-properties response contains `"disableCrossRegionalHedging": true` +- **THEN** the `AccountProperties.DisableCrossRegionalHedging` property SHALL be `true` + +#### Scenario: Gateway response includes the flag set to false +- **WHEN** the Gateway account-properties response contains `"disableCrossRegionalHedging": false` +- **THEN** the `AccountProperties.DisableCrossRegionalHedging` property SHALL be `false` + +#### Scenario: Gateway response does not include the flag +- **WHEN** the Gateway account-properties response does not contain the `"disableCrossRegionalHedging"` key +- **THEN** the `AccountProperties.DisableCrossRegionalHedging` property SHALL be `null` + +--- + +### Requirement: Gateway flag disables all hedging when true +When the Gateway flag `disableCrossRegionalHedging` is `true`, the SDK SHALL disable all hedging for PPAF-enabled accounts regardless of any explicit or implicit hedging configuration. + +#### Scenario: PPAF account with default hedging and flag set to true +- **WHEN** the account has PPAF enabled with SDK-default hedging active +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL disable hedging +- **AND** requests SHALL NOT be hedged across regions + +#### Scenario: PPAF account with explicit customer hedging and flag set to true +- **WHEN** the account has PPAF enabled +- **AND** the customer has configured an explicit `AvailabilityStrategy` via `CosmosClientOptions` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL disable hedging +- **AND** the explicit customer strategy SHALL NOT be executed + +#### Scenario: PPAF account with request-level hedging override and flag set to true +- **WHEN** the account has PPAF enabled +- **AND** a request has a per-request `AvailabilityStrategy` override set in `RequestOptions` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL disable hedging for that request +- **AND** the request-level strategy SHALL NOT be executed + +--- + +### Requirement: Existing behavior preserved when flag is false or absent +When the Gateway flag `disableCrossRegionalHedging` is `false` or absent from the account-properties response, the SDK SHALL preserve existing hedging behavior without any change. + +#### Scenario: PPAF account with flag set to false and no explicit hedging +- **WHEN** the account has PPAF enabled +- **AND** the Gateway flag `disableCrossRegionalHedging` is `false` +- **AND** no explicit customer hedging configuration is set +- **THEN** the SDK SHALL enable the default PPAF hedging strategy with threshold `Min(1000ms, RequestTimeout/2)` and step `500ms` + +#### Scenario: PPAF account with flag absent and explicit hedging configured +- **WHEN** the account has PPAF enabled +- **AND** the Gateway flag `disableCrossRegionalHedging` is absent from the response +- **AND** the customer has configured an explicit `AvailabilityStrategy` +- **THEN** the SDK SHALL honor the customer's explicit hedging configuration + +#### Scenario: PPAF account with flag set to false and explicit hedging configured +- **WHEN** the account has PPAF enabled +- **AND** the Gateway flag `disableCrossRegionalHedging` is `false` +- **AND** the customer has configured an explicit `AvailabilityStrategy` +- **THEN** the SDK SHALL honor the customer's explicit hedging configuration + +--- + +### Requirement: Dynamic toggling via account-properties refresh +The SDK SHALL evaluate the Gateway flag on each account-properties refresh cycle and dynamically enable or disable hedging as the flag value changes, without requiring client restart. + +#### Scenario: Flag toggled from false to true during runtime +- **WHEN** the Gateway flag `disableCrossRegionalHedging` was `false` (or absent) at client initialization +- **AND** hedging was active (default or explicit) +- **AND** the Gateway flag is changed to `true` +- **AND** the SDK observes the updated account properties via the next refresh cycle +- **THEN** the SDK SHALL disable hedging + +#### Scenario: Flag toggled from true to false during runtime +- **WHEN** the Gateway flag `disableCrossRegionalHedging` was `true` and hedging was disabled +- **AND** the Gateway flag is changed to `false` +- **AND** the SDK observes the updated account properties via the next refresh cycle +- **THEN** the SDK SHALL re-enable hedging using the appropriate strategy +- **AND** if the customer had configured an explicit strategy, that strategy SHALL be restored +- **AND** if no explicit strategy was configured, the SDK-default PPAF hedging strategy SHALL be applied + +#### Scenario: Flag toggled from true to false with no prior explicit strategy +- **WHEN** the Gateway flag `disableCrossRegionalHedging` transitions from `true` to `false` +- **AND** the customer did not configure an explicit `AvailabilityStrategy` +- **AND** the account has PPAF enabled +- **THEN** the SDK SHALL re-enable the default PPAF hedging strategy + +--- + +### Requirement: Non-PPAF accounts ignore the flag +The SDK SHALL NOT evaluate or act on the `disableCrossRegionalHedging` flag for accounts that do not have PPAF enabled. + +#### Scenario: Non-PPAF account with flag set to true +- **WHEN** the account does NOT have PPAF enabled (`EnablePartitionLevelFailover` is `false` or absent) +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL ignore the flag +- **AND** any explicit customer hedging configuration SHALL continue to function normally + +#### Scenario: Non-PPAF account with explicit hedging and flag set to true +- **WHEN** the account does NOT have PPAF enabled +- **AND** the customer has configured an explicit `AvailabilityStrategy` +- **AND** the Gateway flag `disableCrossRegionalHedging` is `true` +- **THEN** the SDK SHALL NOT disable the customer's explicit hedging strategy + +--- + +### Requirement: Feature is invisible to end users +The Gateway hedging override flag SHALL NOT be exposed through any public SDK API surface. There SHALL be no new public properties on `CosmosClientOptions`, `RequestOptions`, or any other user-facing type related to this flag. + +#### Scenario: No public API surface for the flag +- **WHEN** a developer inspects the public API of `CosmosClientOptions`, `ItemRequestOptions`, `QueryRequestOptions`, or `ChangeFeedRequestOptions` +- **THEN** there SHALL be no property or method related to `disableCrossRegionalHedging` + +#### Scenario: Diagnostics logging when hedging is disabled by flag +- **WHEN** hedging is disabled due to the Gateway flag being `true` +- **THEN** the SDK SHALL include a trace or diagnostic entry indicating that hedging was disabled by a Gateway account property +- **AND** this information SHALL be available in SDK diagnostics for supportability + +--- + +### Requirement: Precedence rules for hedging evaluation +The SDK SHALL evaluate hedging configuration using the following strict precedence order: +1. Gateway `disableCrossRegionalHedging = true` → hedging OFF (highest priority) +2. If Gateway flag is `false` or absent → evaluate existing rules (request-level override → client-level strategy → PPAF default) + +#### Scenario: Gateway flag true takes precedence over all other configuration +- **WHEN** the Gateway flag `disableCrossRegionalHedging` is `true` +- **AND** the customer has configured an explicit `AvailabilityStrategy` at the client level +- **AND** a request has a per-request `AvailabilityStrategy` override +- **THEN** the SDK SHALL disable hedging for that request +- **AND** neither the client-level nor request-level strategy SHALL be executed + +#### Scenario: Gateway flag false defers to existing precedence +- **WHEN** the Gateway flag `disableCrossRegionalHedging` is `false` +- **AND** the customer has configured an explicit `AvailabilityStrategy` at the client level +- **AND** a request has a per-request `AvailabilityStrategy` override +- **THEN** the request-level strategy SHALL be used (existing precedence preserved) diff --git a/openspec/changes/ppaf-dynamic-hedging-control/tasks.md b/openspec/changes/ppaf-dynamic-hedging-control/tasks.md new file mode 100644 index 0000000000..60c452b896 --- /dev/null +++ b/openspec/changes/ppaf-dynamic-hedging-control/tasks.md @@ -0,0 +1,38 @@ +## 1. Account Properties Model + +- [ ] 1.1 Add `DisableCrossRegionalHedging` nullable bool property to `AccountProperties.cs` with `[JsonProperty("disableCrossRegionalHedging")]` attribute, following the same pattern as `EnablePartitionLevelFailover` +- [ ] 1.2 Add unit tests for `AccountProperties` deserialization: flag present as `true`, flag present as `false`, and flag absent from JSON + +## 2. DocumentClient State Tracking + +- [ ] 2.1 Add internal field `disableCrossRegionalHedgingFlag` (bool, default false) to `DocumentClient` to cache the current Gateway flag value +- [ ] 2.2 Add internal field to store the customer's original explicit `AvailabilityStrategy` reference (if any) so it can be restored when the flag is toggled back to false + +## 3. Hedging Evaluation in Account-Refresh Callback + +- [ ] 3.1 Modify `GlobalEndpointManager.RefreshDatabaseAccountInternalAsync()` (or the existing PPAF-change callback) to propagate the `DisableCrossRegionalHedging` value to `DocumentClient` +- [ ] 3.2 Modify `DocumentClient.UpdatePartitionLevelFailoverConfigWithAccountRefresh()` to evaluate the Gateway flag: when `true`, store the current strategy and set `ConnectionPolicy.AvailabilityStrategy` to null/disabled; when `false` or absent, restore the appropriate strategy +- [ ] 3.3 Ensure the flag is also evaluated during initial client setup in `DocumentClient.InitializePartitionLevelFailoverWithDefaultHedging()` — if the flag is `true` at initialization time, do not enable default hedging + +## 4. Request-Level Enforcement + +- [ ] 4.1 Update `RequestInvokerHandler` hedging resolution logic to check the Gateway disable flag before evaluating request-level or client-level `AvailabilityStrategy` overrides, ensuring the flag takes absolute precedence when `true` + +## 5. Diagnostics and Tracing + +- [ ] 5.1 Add a trace/diagnostic log entry when hedging is disabled due to the Gateway flag, including the flag value and the action taken (e.g., "Hedging disabled by Gateway account property disableCrossRegionalHedging=true") +- [ ] 5.2 Add a trace/diagnostic log entry when hedging is re-enabled after the flag is toggled back to false + +## 6. Unit Tests + +- [ ] 6.1 Test: PPAF account with default hedging — flag `true` disables hedging, flag toggled to `false` re-enables default hedging +- [ ] 6.2 Test: PPAF account with explicit customer hedging — flag `true` disables hedging, flag toggled to `false` restores customer strategy +- [ ] 6.3 Test: PPAF account with request-level hedging override — flag `true` prevents request-level strategy execution +- [ ] 6.4 Test: Non-PPAF account — flag `true` does not affect explicit customer hedging +- [ ] 6.5 Test: Flag absent from account properties — existing behavior unchanged +- [ ] 6.6 Test: Dynamic toggle cycle — enable → disable → re-enable with correct strategy restoration + +## 7. Integration Verification + +- [ ] 7.1 Validate end-to-end with mocked Gateway responses containing the `disableCrossRegionalHedging` flag in integration/emulator tests +- [ ] 7.2 Verify no public API surface changes — confirm `DisableCrossRegionalHedging` is internal only and not exposed on `CosmosClientOptions`, `RequestOptions`, or related types From 816e334fcec043112093ba8b75048a93d0f38846 Mon Sep 17 00:00:00 2001 From: Nalu Tripician Date: Fri, 8 May 2026 10:47:30 -0700 Subject: [PATCH 2/6] Routing: Refactors PPAF Gateway hedging override per code-review feedback Addresses review comments on PR #5829: - #3 Renames shadowing parameters in UpdatePartitionLevelFailoverConfigWithAccountRefresh (isEnabled -> latestIsEnabled, disableCrossRegionalHedging -> latestDisableCrossRegionalHedging) so a future edit dropping a 'this.' prefix cannot silently change semantics. - #6 Adds early-return guard when neither PPAF enablement nor the hedging flag changed, protecting future direct callers from spurious AvailabilityStrategy mutation. - #5 Introduces hedgingStrategyLock and serializes the (flag, stash, ConnectionPolicy.AvailabilityStrategy) mutation sequence in UpdatePartitionLevelFailoverConfigWithAccountRefresh and ApplyHedgingStrategyForCurrentState, so concurrent invocations cannot interleave stash/clear/restore steps. - #11 Emits a TraceWarning when ApplyHedgingStrategyForCurrentState would silently drop a non-default AvailabilityStrategy because a previously-stashed customer strategy is still held. - #8 Replaces reflection-based field access in GatewayHedgingOverrideTests with internal test-only properties (DisableCrossRegionalHedgingForTests, CustomerConfiguredAvailabilityStrategyForTests), so renames are caught at compile time. - #2 / #12 Moves the init-time flag capture and customer-strategy stash from GetInitializationTaskAsync into InitializeGatewayConfigurationReaderAsync, before the GEM background-refresh loop is started and before the change-event handler is subscribed. This eliminates the theoretical race where a refresh-driven event could be overwritten by a subsequent init-time field assignment, and removes the inconsistent null-check style. - #10 Removes unrelated .coding-harness/ entry from .gitignore. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 3 - Microsoft.Azure.Cosmos/src/DocumentClient.cs | 249 ++++++++++++------ .../GatewayHedgingOverrideTests.cs | 25 +- 3 files changed, 175 insertions(+), 102 deletions(-) diff --git a/.gitignore b/.gitignore index faa8cc41a9..a02d350b97 100644 --- a/.gitignore +++ b/.gitignore @@ -331,6 +331,3 @@ ASALocalRun/ # Visual Studio Code files .vscode/ - -.coding-harness/ - diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index 9fd3ddbd31..dbf783aeb5 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -136,6 +136,14 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider internal readonly AuthorizationTokenProvider cosmosAuthorization; private readonly bool isThinClientFeatureFlagEnabled = ConfigurationManager.IsThinClientEnabled(defaultValue: false); + + // Serializes the (disableCrossRegionalHedging, customerConfiguredAvailabilityStrategy, + // ConnectionPolicy.AvailabilityStrategy) mutation sequence performed by the gateway-driven + // hedging-override reconcile path. Without this, two concurrent invocations of + // ApplyHedgingStrategyForCurrentState (e.g., a future caller racing the GEM background-refresh + // thread) could interleave their stash/clear/restore steps and leave the SDK in an inconsistent state. + private readonly object hedgingStrategyLock = new object(); + internal bool isThinClientEnabled; // Gateway has backoff/retry logic to hide transient errors. @@ -176,11 +184,14 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider // Gateway-controlled override that disables all cross-regional hedging for PPAF accounts. // Mirrors AccountProperties.DisableCrossRegionalHedging from the most recent account-properties refresh. + // Always read/written under hedgingStrategyLock from the refresh path; per-request reads of the + // mirrored ConnectionPolicy.AvailabilityStrategy rely on atomic reference assignment. private bool disableCrossRegionalHedging; // When the gateway disable flag is true, the customer's explicit AvailabilityStrategy // (if any) is stashed here so it can be restored verbatim if the flag is later toggled back to false. // Null when the customer never configured a strategy or when no stash is currently held. + // Mutated only under hedgingStrategyLock. private AvailabilityStrategy customerConfiguredAvailabilityStrategy; // creator of TransportClient is responsible for disposing it. @@ -1072,21 +1083,6 @@ private async Task GetInitializationTaskAsync(IStoreClientFactory storeCli this.ConnectionPolicy.EnablePartitionLevelFailover = this.accountServiceConfiguration.AccountProperties.EnablePartitionLevelFailover.Value; } - // Capture the initial gateway disableCrossRegionalHedging flag and stash any customer-configured - // AvailabilityStrategy so it can be restored if the flag is later toggled back to false. - if (!this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride - && this.accountServiceConfiguration?.AccountProperties != null) - { - this.disableCrossRegionalHedging = this.accountServiceConfiguration.AccountProperties.DisableCrossRegionalHedging ?? false; - if (this.disableCrossRegionalHedging && this.ConnectionPolicy.AvailabilityStrategy != null) - { - this.customerConfiguredAvailabilityStrategy = this.ConnectionPolicy.AvailabilityStrategy; - this.ConnectionPolicy.AvailabilityStrategy = null; - DefaultTrace.TraceInformation( - "DocumentClient: Hedging disabled at initialization by Gateway property disableCrossRegionalHedging=true"); - } - } - this.isThinClientEnabled = this.isThinClientFeatureFlagEnabled && (this.ConnectionPolicy.ConnectionMode == ConnectionMode.Gateway) && (this.accountServiceConfiguration.AccountProperties?.ThinClientWritableLocationsInternal?.Count ?? 0) > 0; this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker |= this.ConnectionPolicy.EnablePartitionLevelFailover; @@ -6892,8 +6888,30 @@ private async Task InitializeGatewayConfigurationReaderAsync() await this.accountServiceConfiguration.InitializeAsync(); AccountProperties accountProperties = this.accountServiceConfiguration.AccountProperties; this.UseMultipleWriteLocations = this.ConnectionPolicy.UseMultipleWriteLocations && accountProperties.EnableMultipleWriteLocations; - this.GlobalEndpointManager.InitializeAccountPropertiesAndStartBackgroundRefresh(accountProperties); + + // Capture the initial gateway disableCrossRegionalHedging flag and stash any customer-configured + // AvailabilityStrategy so it can be restored if the flag is later toggled back to false. + // This must run BEFORE the GEM background refresh loop is started and BEFORE the change-event + // handler is subscribed: if either happens first, a refresh-driven event firing concurrently with + // initialization could otherwise be overwritten here, leaving the SDK with stale state and a + // mismatched GEM baseline that would never re-fire. + if (!this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride) + { + lock (this.hedgingStrategyLock) + { + this.disableCrossRegionalHedging = accountProperties.DisableCrossRegionalHedging ?? false; + if (this.disableCrossRegionalHedging && this.ConnectionPolicy.AvailabilityStrategy != null) + { + this.customerConfiguredAvailabilityStrategy = this.ConnectionPolicy.AvailabilityStrategy; + this.ConnectionPolicy.AvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging disabled at initialization by Gateway property disableCrossRegionalHedging=true"); + } + } + } + this.GlobalEndpointManager.OnEnablePartitionLevelFailoverConfigChanged += this.UpdatePartitionLevelFailoverConfigWithAccountRefresh; + this.GlobalEndpointManager.InitializeAccountPropertiesAndStartBackgroundRefresh(accountProperties); } internal string GetUserAgentFeatures() @@ -6958,8 +6976,8 @@ internal void InitializePartitionLevelFailoverWithDefaultHedging() } internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( - bool isEnabled, - bool disableCrossRegionalHedging) + bool latestIsEnabled, + bool latestDisableCrossRegionalHedging) { // Only update if client-level override is not disabled if (this.ConnectionPolicy.DisablePartitionLevelFailoverClientLevelOverride) @@ -6968,46 +6986,58 @@ internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( return; } - bool ppafEnablementChanged = this.ConnectionPolicy.EnablePartitionLevelFailover != isEnabled; - bool hedgingFlagChanged = this.disableCrossRegionalHedging != disableCrossRegionalHedging; - - if (ppafEnablementChanged) + lock (this.hedgingStrategyLock) { - DefaultTrace.TraceInformation( - "DocumentClient: PPAF Account Level Config Updated. Updating EnablePartitionLevelFailover to {0}", - isEnabled); + bool ppafEnablementChanged = this.ConnectionPolicy.EnablePartitionLevelFailover != latestIsEnabled; + bool hedgingFlagChanged = this.disableCrossRegionalHedging != latestDisableCrossRegionalHedging; - // Step 1: Enable partition level failover. - this.PartitionKeyRangeLocation.SetIsPPAFEnabled(isEnabled); - this.ConnectionPolicy.EnablePartitionLevelFailover = isEnabled; + // No-op when nothing has actually changed. In production GEM only fires the event on + // transitions, but this method is internal and may be invoked directly (e.g., from tests + // or future callers); without this guard, ApplyHedgingStrategyForCurrentState would still + // run and could clear an SDK-default strategy that was correctly installed. + if (!ppafEnablementChanged && !hedgingFlagChanged) + { + return; + } - // Step 2: Enable partition level circuit breaker. - this.PartitionKeyRangeLocation.SetIsPPCBEnabled(isEnabled); - this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker = isEnabled; - } + if (ppafEnablementChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: PPAF Account Level Config Updated. Updating EnablePartitionLevelFailover to {0}", + latestIsEnabled); - if (hedgingFlagChanged) - { - DefaultTrace.TraceInformation( - "DocumentClient: Gateway disableCrossRegionalHedging flag changed to {0}", - disableCrossRegionalHedging); - this.disableCrossRegionalHedging = disableCrossRegionalHedging; - } + // Step 1: Enable partition level failover. + this.PartitionKeyRangeLocation.SetIsPPAFEnabled(latestIsEnabled); + this.ConnectionPolicy.EnablePartitionLevelFailover = latestIsEnabled; - // Step 3: Reconcile the AvailabilityStrategy with the latest account state. The gateway - // disable flag has the highest precedence — when true, hedging is OFF regardless of any - // explicit or default configuration. When false, restore the customer's explicit strategy - // (if previously stashed) or apply the SDK default for PPAF. - this.ApplyHedgingStrategyForCurrentState(); + // Step 2: Enable partition level circuit breaker. + this.PartitionKeyRangeLocation.SetIsPPCBEnabled(latestIsEnabled); + this.ConnectionPolicy.EnablePartitionLevelCircuitBreaker = latestIsEnabled; + } - if (ppafEnablementChanged) - { - // Step 4: Update the user agent features. Hedging-flag-only changes do not affect the - // PPAF-related user-agent feature flags. - this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); - } + if (hedgingFlagChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: Gateway disableCrossRegionalHedging flag changed to {0}", + latestDisableCrossRegionalHedging); + this.disableCrossRegionalHedging = latestDisableCrossRegionalHedging; + } + + // Step 3: Reconcile the AvailabilityStrategy with the latest account state. The gateway + // disable flag has the highest precedence — when true, hedging is OFF regardless of any + // explicit or default configuration. When false, restore the customer's explicit strategy + // (if previously stashed) or apply the SDK default for PPAF. + this.ApplyHedgingStrategyForCurrentState(); + + if (ppafEnablementChanged) + { + // Step 4: Update the user agent features. Hedging-flag-only changes do not affect the + // PPAF-related user-agent feature flags. + this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); + } - DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + } } /// @@ -7020,59 +7050,112 @@ internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( /// 2. Customer explicitly configured a strategy: keep / restore that strategy. /// 3. PPAF enabled with no explicit strategy: apply SDK default PPAF hedging. /// 4. PPAF disabled: clear any SDK-default strategy that we previously installed. + /// Caller must hold ; the entry-point lock is reentrant so this + /// method also acquires it for direct invocations from tests / future callers. /// private void ApplyHedgingStrategyForCurrentState() { - if (this.disableCrossRegionalHedging) + lock (this.hedgingStrategyLock) { - AvailabilityStrategy currentStrategy = this.ConnectionPolicy.AvailabilityStrategy; - if (currentStrategy != null) + if (this.disableCrossRegionalHedging) + { + AvailabilityStrategy currentStrategy = this.ConnectionPolicy.AvailabilityStrategy; + if (currentStrategy != null) + { + bool currentIsSdkDefault = currentStrategy is CrossRegionHedgingAvailabilityStrategy hedging + && hedging.IsSDKDefaultStrategyForPPAF; + + // Only stash customer-configured strategies. The SDK default can be regenerated + // deterministically from PPAF state, so re-stashing it would only cause confusion + // when reconciling later. + if (!currentIsSdkDefault) + { + if (this.customerConfiguredAvailabilityStrategy == null) + { + this.customerConfiguredAvailabilityStrategy = currentStrategy; + } + else if (!ReferenceEquals(this.customerConfiguredAvailabilityStrategy, currentStrategy)) + { + // A previously-stashed customer strategy is still held while a different + // non-default strategy is currently on ConnectionPolicy. Silently dropping + // the new strategy would lose customer configuration, so surface this loudly + // — it indicates a re-entrant or otherwise unexpected code path. + DefaultTrace.TraceWarning( + "DocumentClient: ApplyHedgingStrategyForCurrentState observed a non-default " + + "AvailabilityStrategy while a previously-stashed customer strategy is still held; " + + "the current strategy will be cleared without being stashed. This may indicate a " + + "re-entrant code path."); + } + } + + this.ConnectionPolicy.AvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging disabled by Gateway property disableCrossRegionalHedging=true"); + } + return; + } + + // disableCrossRegionalHedging == false: restore or rebuild the appropriate strategy. + if (this.customerConfiguredAvailabilityStrategy != null) { - bool currentIsSdkDefault = currentStrategy is CrossRegionHedgingAvailabilityStrategy hedging - && hedging.IsSDKDefaultStrategyForPPAF; + this.ConnectionPolicy.AvailabilityStrategy = this.customerConfiguredAvailabilityStrategy; + this.customerConfiguredAvailabilityStrategy = null; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging re-enabled — restored customer-configured AvailabilityStrategy"); + return; + } - // Only stash customer-configured strategies. The SDK default can be regenerated - // deterministically from PPAF state, so re-stashing it would only cause confusion - // when reconciling later. - if (!currentIsSdkDefault && this.customerConfiguredAvailabilityStrategy == null) + if (this.ConnectionPolicy.EnablePartitionLevelFailover && this.ConnectionPolicy.AvailabilityStrategy == null) + { + this.InitializePartitionLevelFailoverWithDefaultHedging(); + if (this.ConnectionPolicy.AvailabilityStrategy != null) { - this.customerConfiguredAvailabilityStrategy = currentStrategy; + DefaultTrace.TraceInformation( + "DocumentClient: Hedging re-enabled — applied SDK default PPAF hedging strategy"); } + return; + } + if (!this.ConnectionPolicy.EnablePartitionLevelFailover + && this.ConnectionPolicy.AvailabilityStrategy is CrossRegionHedgingAvailabilityStrategy sdkDefault + && sdkDefault.IsSDKDefaultStrategyForPPAF) + { + // PPAF disabled — drop the SDK default we previously installed. this.ConnectionPolicy.AvailabilityStrategy = null; - DefaultTrace.TraceInformation( - "DocumentClient: Hedging disabled by Gateway property disableCrossRegionalHedging=true"); } - return; } + } - // disableCrossRegionalHedging == false: restore or rebuild the appropriate strategy. - if (this.customerConfiguredAvailabilityStrategy != null) + // Test-only accessors. Visible to the unit-test assembly via [InternalsVisibleTo] in + // Microsoft.Azure.Cosmos.csproj. Used in lieu of reflection so renames or refactors of the + // backing fields are caught at compile time rather than blowing up with NullReferenceException + // from a stale System.Reflection.FieldInfo at test runtime. + internal bool DisableCrossRegionalHedgingForTests + { + get { - this.ConnectionPolicy.AvailabilityStrategy = this.customerConfiguredAvailabilityStrategy; - this.customerConfiguredAvailabilityStrategy = null; - DefaultTrace.TraceInformation( - "DocumentClient: Hedging re-enabled — restored customer-configured AvailabilityStrategy"); - return; + lock (this.hedgingStrategyLock) + { + return this.disableCrossRegionalHedging; + } } - - if (this.ConnectionPolicy.EnablePartitionLevelFailover && this.ConnectionPolicy.AvailabilityStrategy == null) + set { - this.InitializePartitionLevelFailoverWithDefaultHedging(); - if (this.ConnectionPolicy.AvailabilityStrategy != null) + lock (this.hedgingStrategyLock) { - DefaultTrace.TraceInformation( - "DocumentClient: Hedging re-enabled — applied SDK default PPAF hedging strategy"); + this.disableCrossRegionalHedging = value; } - return; } + } - if (!this.ConnectionPolicy.EnablePartitionLevelFailover - && this.ConnectionPolicy.AvailabilityStrategy is CrossRegionHedgingAvailabilityStrategy sdkDefault - && sdkDefault.IsSDKDefaultStrategyForPPAF) + internal AvailabilityStrategy CustomerConfiguredAvailabilityStrategyForTests + { + get { - // PPAF disabled — drop the SDK default we previously installed. - this.ConnectionPolicy.AvailabilityStrategy = null; + lock (this.hedgingStrategyLock) + { + return this.customerConfiguredAvailabilityStrategy; + } } } diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs index 2b3ed56355..ff9d96c77a 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs @@ -1,7 +1,6 @@ -namespace Microsoft.Azure.Cosmos.Tests +namespace Microsoft.Azure.Cosmos.Tests { using System; - using System.Reflection; using Microsoft.Azure.Cosmos; using Microsoft.VisualStudio.TestTools.UnitTesting; using Newtonsoft.Json; @@ -17,12 +16,6 @@ public class GatewayHedgingOverrideTests private const string AccountEndpoint = "https://localhost:8081/"; private const string AccountKey = "C2y6yDjf5/R+ob0N8A7Cgv30VRDJIWEHLM+4QDU5DE2nQ9nDuVTqobD4b8mGGyPMbIZnqyMsEcaGQy67XIw/Jw=="; - private static readonly FieldInfo DisableHedgingField = - typeof(DocumentClient).GetField("disableCrossRegionalHedging", BindingFlags.Instance | BindingFlags.NonPublic); - - private static readonly FieldInfo CustomerStrategyField = - typeof(DocumentClient).GetField("customerConfiguredAvailabilityStrategy", BindingFlags.Instance | BindingFlags.NonPublic); - [TestMethod] public void AccountProperties_DisableCrossRegionalHedging_True_Deserializes() { @@ -109,7 +102,7 @@ public void UpdateConfig_FlagToggleOn_StashesCustomerStrategyAndClearsAvailabili DocumentClient client = CreateClient(policy); try { - client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Strategy must be cleared when disable flag flips to true"); Assert.AreSame( @@ -136,11 +129,11 @@ public void UpdateConfig_FlagToggleOff_RestoresStashedCustomerStrategy() try { // Step 1: disable hedging — strategy is stashed. - client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy); // Step 2: re-enable hedging — strategy is restored. - client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: false); + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: false); Assert.AreSame( customerStrategy, @@ -170,7 +163,7 @@ public void UpdateConfig_ClientLevelOverrideDisabled_FlagIsIgnored() DocumentClient client = CreateClient(policy); try { - client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); Assert.AreSame( customerStrategy, @@ -196,7 +189,7 @@ public void UpdateConfig_FlagTrue_DoesNotStashSDKDefaultStrategy() Assert.IsTrue(((CrossRegionHedgingAvailabilityStrategy)client.ConnectionPolicy.AvailabilityStrategy).IsSDKDefaultStrategyForPPAF); // Now flip the flag — the SDK default should be cleared but NOT stashed. - client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: true); + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: true); Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy); Assert.IsNull( @@ -204,7 +197,7 @@ public void UpdateConfig_FlagTrue_DoesNotStashSDKDefaultStrategy() "SDK-default strategy must NOT be stashed — it can be regenerated deterministically"); // Toggle back off — SDK default is rebuilt from PPAF state. - client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(isEnabled: true, disableCrossRegionalHedging: false); + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh(latestIsEnabled: true, latestDisableCrossRegionalHedging: false); Assert.IsNotNull(client.ConnectionPolicy.AvailabilityStrategy); Assert.IsTrue(((CrossRegionHedgingAvailabilityStrategy)client.ConnectionPolicy.AvailabilityStrategy).IsSDKDefaultStrategyForPPAF); @@ -222,12 +215,12 @@ private static DocumentClient CreateClient(ConnectionPolicy policy) private static void SetDisableHedgingFlag(DocumentClient client, bool value) { - DisableHedgingField.SetValue(client, value); + client.DisableCrossRegionalHedgingForTests = value; } private static AvailabilityStrategy GetCustomerStashedStrategy(DocumentClient client) { - return (AvailabilityStrategy)CustomerStrategyField.GetValue(client); + return client.CustomerConfiguredAvailabilityStrategyForTests; } } } From d9041cae438bbf853b281a42911a29e5d526fa11 Mon Sep 17 00:00:00 2001 From: Nalu Tripician Date: Thu, 14 May 2026 13:42:14 -0700 Subject: [PATCH 3/6] Routing: Refactors PPAF Gateway hedging override per second-round review feedback Addresses the 2026-05-14 review comments from @kundadebdatta on PR #5829: - AccountProperties.DisableCrossRegionalHedging: adds a TODO noting that the JSON property name should move to Constants.Properties once the underlying constant ships in the Microsoft.Azure.Cosmos.Direct package. Also drops the stale `only honored for accounts with PPAF enabled'' phrase from the doc remarks, matching the design clarification that the flag applies to both PPAF and non-PPAF accounts. - DocumentClient.hedgingStrategyLock: expands the field-level comment to call out that the lock is NOT load-bearing under current callers (GEM serializes refreshes; init runs before subscription) and exists specifically for the internal test accessors and as future-proofing for new direct callers. - UpdatePartitionLevelFailoverConfigWithAccountRefresh: expands the comments on the no-change early-return guard and on the unconditional ApplyHedgingStrategyForCurrentState() call to make their rationale explicit (guard exists for direct callers / unit tests; reconcile is intentionally shared between the ppafEnablementChanged and hedgingFlagChanged paths because PPAF toggles also need to install or drop the SDK default). - UpdatePartitionLevelFailoverConfigWithAccountRefresh: adds a matching `Successfully reconciled hedging strategy dynamically'' trace for hedging-flag-only transitions, and moves the PPAF `Successfully updated PPAF configuration dynamically'' trace inside the ppafEnablementChanged branch so each transition logs exactly one terminal trace. - DisableCrossRegionalHedgingForTests / CustomerConfiguredAvailabilityStrategyForTests: expands the comment to explain why these accessors exist instead of mocking AccountProperties (no current mocking seam past GatewayAccountReader.InitializeReaderAsync, which performs real I/O) and notes that they should be removed once such a seam is introduced. - changelog.md: adds an [Unreleased] section with the PR #5829 entry under Added. No behavioral changes -- comments, traces, and the changelog entry only. GatewayHedgingOverrideTests (10 tests) continue to pass. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Microsoft.Azure.Cosmos/src/DocumentClient.cs | 85 +++++++++++++++---- .../Resource/Settings/AccountProperties.cs | 9 +- changelog.md | 6 ++ 3 files changed, 82 insertions(+), 18 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index dbf783aeb5..5bfd53f03f 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -139,9 +139,27 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider // Serializes the (disableCrossRegionalHedging, customerConfiguredAvailabilityStrategy, // ConnectionPolicy.AvailabilityStrategy) mutation sequence performed by the gateway-driven - // hedging-override reconcile path. Without this, two concurrent invocations of - // ApplyHedgingStrategyForCurrentState (e.g., a future caller racing the GEM background-refresh - // thread) could interleave their stash/clear/restore steps and leave the SDK in an inconsistent state. + // hedging-override reconcile path. + // + // NOTE: This lock is NOT load-bearing under the current set of callers — the GlobalEndpointManager + // serializes its account-properties refreshes, and the init-time stash in + // InitializeGatewayConfigurationReaderAsync runs strictly before the background-refresh loop is + // started and before OnEnablePartitionLevelFailoverConfigChanged is subscribed. So in production + // today there is no real concurrency hazard to defend against here. + // + // The lock exists for two narrower reasons: + // 1. Internal test accessors (DisableCrossRegionalHedgingForTests, + // CustomerConfiguredAvailabilityStrategyForTests) can read/write the same state directly + // from arbitrary test threads, so the accessors and the reconcile path need to agree on + // a single mutex. + // 2. Future-proofing: UpdatePartitionLevelFailoverConfigWithAccountRefresh and + // ApplyHedgingStrategyForCurrentState are internal/private but reachable from any new + // caller that wires them up. If a future change ever invokes them off the GEM-serialized + // path, the stash/clear/restore sequence stays atomic instead of regressing silently. + // + // Per-request reads of ConnectionPolicy.AvailabilityStrategy from RequestInvokerHandler continue + // to rely on atomic reference assignment; they observe either the pre- or post-transition strategy, + // never a half-applied state, and intentionally do not take this lock. private readonly object hedgingStrategyLock = new object(); internal bool isThinClientEnabled; @@ -6991,10 +7009,18 @@ internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( bool ppafEnablementChanged = this.ConnectionPolicy.EnablePartitionLevelFailover != latestIsEnabled; bool hedgingFlagChanged = this.disableCrossRegionalHedging != latestDisableCrossRegionalHedging; - // No-op when nothing has actually changed. In production GEM only fires the event on - // transitions, but this method is internal and may be invoked directly (e.g., from tests - // or future callers); without this guard, ApplyHedgingStrategyForCurrentState would still - // run and could clear an SDK-default strategy that was correctly installed. + // No-op when nothing has actually changed. + // + // In production this branch is unreachable: GlobalEndpointManager's + // RefreshLocationAsync only fires OnEnablePartitionLevelFailoverConfigChanged when + // either EnablePartitionLevelFailover or lastKnownDisableCrossRegionalHedging + // transitions, so the callback is invoked only on a real change. + // + // The guard is defense-in-depth for direct callers of this internal method — + // primarily unit tests that exercise the reconcile logic without going through the + // GEM event, and any future caller that wires up its own invocation. Without it, + // ApplyHedgingStrategyForCurrentState would still run on a no-change call and could + // clear an SDK-default strategy that was correctly installed. if (!ppafEnablementChanged && !hedgingFlagChanged) { return; @@ -7023,10 +7049,17 @@ internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( this.disableCrossRegionalHedging = latestDisableCrossRegionalHedging; } - // Step 3: Reconcile the AvailabilityStrategy with the latest account state. The gateway - // disable flag has the highest precedence — when true, hedging is OFF regardless of any - // explicit or default configuration. When false, restore the customer's explicit strategy - // (if previously stashed) or apply the SDK default for PPAF. + // Step 3: Reconcile the AvailabilityStrategy with the latest account state. + // + // Note: this call is intentionally outside the `if (hedgingFlagChanged)` block above + // because reconciliation is also required when PPAF enablement toggles without the + // hedging flag changing. Specifically: + // • PPAF transitioned off → drop the SDK-default strategy we previously installed. + // • PPAF transitioned on with no customer strategy → install the SDK default. + // The early-return at the top of the method already guarantees we get here only when + // at least one of (ppafEnablementChanged, hedgingFlagChanged) is true, so this call + // is never wasted. The gateway disable flag has the highest precedence — when true, + // hedging is OFF regardless of any explicit or default configuration. this.ApplyHedgingStrategyForCurrentState(); if (ppafEnablementChanged) @@ -7034,9 +7067,16 @@ internal void UpdatePartitionLevelFailoverConfigWithAccountRefresh( // Step 4: Update the user agent features. Hedging-flag-only changes do not affect the // PPAF-related user-agent feature flags. this.ConnectionPolicy.UserAgentContainer.AppendFeatures(this.GetUserAgentFeatures()); + + DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); } - DefaultTrace.TraceInformation("DocumentClient: Successfully updated PPAF configuration dynamically"); + if (hedgingFlagChanged) + { + DefaultTrace.TraceInformation( + "DocumentClient: Successfully reconciled hedging strategy dynamically (disableCrossRegionalHedging={0})", + latestDisableCrossRegionalHedging); + } } } @@ -7127,9 +7167,24 @@ private void ApplyHedgingStrategyForCurrentState() } // Test-only accessors. Visible to the unit-test assembly via [InternalsVisibleTo] in - // Microsoft.Azure.Cosmos.csproj. Used in lieu of reflection so renames or refactors of the - // backing fields are caught at compile time rather than blowing up with NullReferenceException - // from a stale System.Reflection.FieldInfo at test runtime. + // Microsoft.Azure.Cosmos.csproj. + // + // Why these exist rather than mocking AccountProperties: + // • The "real-time" path that populates this state runs through + // CosmosAccountServiceConfiguration.InitializeAsync → GatewayAccountReader.InitializeReaderAsync, + // which currently performs a real HTTPS account-properties read against the configured + // endpoint and has no injection seam for a fake account snapshot. Pre-populating these + // fields via AccountProperties is therefore not achievable from a pure unit test today. + // • These accessors let the unit tests pin the (disableCrossRegionalHedging, + // customerConfiguredAvailabilityStrategy) precondition directly and then invoke the + // reconcile entry points (UpdatePartitionLevelFailoverConfigWithAccountRefresh, + // ApplyHedgingStrategyForCurrentState) to verify the transition logic without any I/O. + // • They are preferred over System.Reflection because renames or refactors of the backing + // fields are caught at compile time, rather than blowing up at test runtime with + // NullReferenceException from a stale FieldInfo cache. + // + // Once a proper IGatewayAccountReader / IAccountServiceConfiguration mocking seam is + // introduced these accessors should be removed in favor of mocking the account snapshot. internal bool DisableCrossRegionalHedgingForTests { get diff --git a/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs b/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs index 02040c34ba..096268a89c 100644 --- a/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs +++ b/Microsoft.Azure.Cosmos/src/Resource/Settings/AccountProperties.cs @@ -255,10 +255,13 @@ internal long ProvisionedDocumentStorageInMB /// When this flag is , the SDK disables all hedging (both SDK-default PPAF /// hedging and any explicit customer-configured ) regardless /// of any other configuration. When the flag is or - /// (absent from the Gateway response), existing hedging behavior is preserved. The flag is only - /// honored for accounts with PPAF enabled and is intended as an operational escape hatch — it is - /// not exposed through any public SDK API surface. + /// (absent from the Gateway response), existing hedging behavior is preserved. The flag is intended + /// as an operational escape hatch and is not exposed through any public SDK API surface. /// + // TODO: The JSON property name is hard-coded here because the corresponding constant has not yet been + // published in the Microsoft.Azure.Cosmos.Direct package referenced by this SDK. Once Direct is updated + // to expose this name on Constants.Properties, refactor this attribute to read from + // Constants.Properties. for consistency with the other AccountProperties JSON bindings. [JsonProperty(PropertyName = "disableCrossRegionalHedging")] internal bool? DisableCrossRegionalHedging { get; set; } diff --git a/changelog.md b/changelog.md index a0ad03bf50..2cf445fc9b 100644 --- a/changelog.md +++ b/changelog.md @@ -15,6 +15,12 @@ Preview features are treated as a separate branch and will not be included in th The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +### [Unreleased] + +#### Added + +- [5829](https://github.com/Azure/azure-cosmos-dotnet-v3/pull/5829) Routing: Adds Gateway Account Property Flag for Dynamic Hedging Control. Introduces a Gateway-controlled `disableCrossRegionalHedging` account property that lets operators dynamically disable PPAF cross-region hedging (and any customer-configured `AvailabilityStrategy`) without rolling back PPAF entirely. The SDK reconciles `ConnectionPolicy.AvailabilityStrategy` against the Gateway-supplied flag on each account-properties refresh; toggling the flag back to `false` restores the customer-configured strategy or rebuilds the SDK default. + ### [3.60.0-preview.0](https://www.nuget.org/packages/Microsoft.Azure.Cosmos/3.60.0-preview.0) - 2026-4-24 #### Added From 6433c5afbc67c0ee31772737c760c604a3443ec2 Mon Sep 17 00:00:00 2001 From: Nalu Tripician Date: Fri, 15 May 2026 14:18:41 -0700 Subject: [PATCH 4/6] Routing: Refactors PPAF gateway hedging override per third-round review Round 2 review feedback (kushagraThapar, kundadebdatta) on PR #5829: * RequestInvokerHandler.AvailabilityStrategy now short-circuits to null when the gateway flag is true, giving the operator override absolute precedence over per-request and client-level AvailabilityStrategy (K-1, K-5). * DocumentClient gains an internal IsHedgingDisabledByGateway accessor that reads the cached flag under hedgingStrategyLock. * GlobalEndpointManager.RefreshDatabaseAccountInternalAsync now guards the hedging change-detection with HasValue (mirrors the existing PPAF guard). An absent property is treated as "no signal" rather than implicit false, so a transient property-drop response cannot silently re-enable hedging during the exact window the operator most wants it disabled. The cached lastKnownDisableCrossRegionalHedging is preserved across null reads (K-2). * Specs/design/proposal updated: the flag now applies to both PPAF and non-PPAF accounts; the only documented bypass is a customer-configured client-level AvailabilityStrategy (K-3). Added Cross-SDK Parity section pointing at the Java tracking issue. * Tests: - GatewayHedgingOverrideTests: RequestInvokerHandler precedence (flag-true returns null even with per-request strategy; flag-false honors the per-request strategy verbatim). - GlobalEndpointManagerTest: property-drop-after-true does NOT fire the change event; explicit-false-after-true DOES fire it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Microsoft.Azure.Cosmos/src/DocumentClient.cs | 18 +++ .../src/Handler/RequestInvokerHandler.cs | 14 ++- .../src/Routing/GlobalEndpointManager.cs | 23 +++- .../GatewayHedgingOverrideTests.cs | 88 +++++++++++++ .../GlobalEndpointManagerTest.cs | 117 ++++++++++++++++++ .../ppaf-dynamic-hedging-control/design.md | 3 +- .../ppaf-dynamic-hedging-control/proposal.md | 9 +- .../specs/gateway-hedging-override/spec.md | 19 +-- .../ppaf-dynamic-hedging-control/tasks.md | 6 +- 9 files changed, 277 insertions(+), 20 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index 5bfd53f03f..18e0e2e96d 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -7166,6 +7166,24 @@ private void ApplyHedgingStrategyForCurrentState() } } + /// + /// Atomic read of the cached Gateway disableCrossRegionalHedging flag, taken under + /// . Used by + /// to enforce the operator-override precedence over per-request and client-level strategy: + /// when the Gateway flag is true, hedging is OFF for every request on this client, + /// regardless of where the strategy was configured. + /// + internal bool IsHedgingDisabledByGateway + { + get + { + lock (this.hedgingStrategyLock) + { + return this.disableCrossRegionalHedging; + } + } + } + // Test-only accessors. Visible to the unit-test assembly via [InternalsVisibleTo] in // Microsoft.Azure.Cosmos.csproj. // diff --git a/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs b/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs index 168e280bfd..a0ff12adf4 100644 --- a/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs +++ b/Microsoft.Azure.Cosmos/src/Handler/RequestInvokerHandler.cs @@ -124,12 +124,24 @@ public override async Task SendAsync( /// /// This method determines if there is an availability strategy that the request can use. - /// Note that the request level availability strategy options override the client level options. + /// Note that the request level availability strategy options override the client level options, + /// but the Gateway-driven operator override () + /// takes absolute precedence over both — when the Gateway flag + /// disableCrossRegionalHedging is true, hedging is OFF for every request on this + /// client regardless of where the strategy was configured. /// /// /// whether the request should be a parallel hedging request. public AvailabilityStrategyInternal AvailabilityStrategy(RequestMessage request) { + // Gateway-driven operator override has absolute precedence over any request-level or + // client-level AvailabilityStrategy. See spec.md → "Gateway flag disables all hedging + // when true" and tasks.md item 4.1. + if (this.client.DocumentClient.IsHedgingDisabledByGateway) + { + return null; + } + AvailabilityStrategy strategy = request.RequestOptions?.AvailabilityStrategy ?? this.client.DocumentClient.ConnectionPolicy.AvailabilityStrategy; diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs index f228816d82..ed88cf4166 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalEndpointManager.cs @@ -791,16 +791,31 @@ private async Task RefreshDatabaseAccountInternalAsync(bool forceRefresh) && accountProperties.EnablePartitionLevelFailover.HasValue && (this.connectionPolicy.EnablePartitionLevelFailover != accountProperties.EnablePartitionLevelFailover.Value); - bool currentDisableHedgingFlag = accountProperties.DisableCrossRegionalHedging ?? false; + // Hedging change-detection mirrors the PPAF .HasValue guard above: + // a missing property in the response is "no signal", NOT an implicit false. + // This prevents a transient gateway response that drops the property + // (e.g., partial regional failover, stale gateway version) from being + // interpreted as a true -> false transition that re-enables hedging + // during the very window the operator most wants it disabled. + // + // Runbook contract: on-call disables via an explicit "false" property + // value, not by removing the property override. bool disableHedgingFlagChanged = !ignorePpafChanges - && (currentDisableHedgingFlag != this.lastKnownDisableCrossRegionalHedging); + && accountProperties.DisableCrossRegionalHedging.HasValue + && (accountProperties.DisableCrossRegionalHedging.Value != this.lastKnownDisableCrossRegionalHedging); if (ppafEnablementChanged || disableHedgingFlagChanged) { bool latestPpafEnabled = accountProperties.EnablePartitionLevelFailover ?? this.connectionPolicy.EnablePartitionLevelFailover; - this.lastKnownDisableCrossRegionalHedging = currentDisableHedgingFlag; - this.OnEnablePartitionLevelFailoverConfigChanged?.Invoke(latestPpafEnabled, currentDisableHedgingFlag); + + // Only advance lastKnown when the gateway emitted an explicit value; otherwise + // preserve the cached value so a later property-restored response diffs against + // the previously-honored state (rather than against an implicit false baseline). + bool latestDisableHedging = accountProperties.DisableCrossRegionalHedging + ?? this.lastKnownDisableCrossRegionalHedging; + this.lastKnownDisableCrossRegionalHedging = latestDisableHedging; + this.OnEnablePartitionLevelFailoverConfigChanged?.Invoke(latestPpafEnabled, latestDisableHedging); } GlobalEndpointManager.ParseThinClientLocationsFromAdditionalProperties(accountProperties); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs index ff9d96c77a..37f1045843 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs @@ -1,7 +1,10 @@ namespace Microsoft.Azure.Cosmos.Tests { using System; + using System.Net.Http; using Microsoft.Azure.Cosmos; + using Microsoft.Azure.Cosmos.Handlers; + using Microsoft.Azure.Documents; using Microsoft.VisualStudio.TestTools.UnitTesting; using Newtonsoft.Json; @@ -208,6 +211,91 @@ public void UpdateConfig_FlagTrue_DoesNotStashSDKDefaultStrategy() } } + /// + /// Regression test: when the Gateway flag is true on the underlying + /// , + /// MUST return null even if the request itself supplies a per-request + /// via . + /// This is the absolute-precedence guarantee called out in the spec — the operator's + /// kill-switch wins over both per-request and client-level configuration. + /// + [TestMethod] + public void RequestInvokerHandler_FlagTrue_PerRequestStrategy_ReturnsNull() + { + using CosmosClient mockCosmosClient = MockCosmosUtil.CreateMockCosmosClient(); + mockCosmosClient.DocumentClient.DisableCrossRegionalHedgingForTests = true; + + RequestInvokerHandler handler = new RequestInvokerHandler( + mockCosmosClient, + requestedClientConsistencyLevel: null, + requestedClientReadConsistencyStrategy: null, + requestedClientPriorityLevel: null, + requestedClientThroughputBucket: null); + + using RequestMessage request = new RequestMessage( + HttpMethod.Get, + new Uri("/dbs/testdb/colls/testcontainer/docs/testId", UriKind.Relative)) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + RequestOptions = new RequestOptions + { + AvailabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)) + } + }; + + AvailabilityStrategyInternal resolved = handler.AvailabilityStrategy(request); + + Assert.IsNull( + resolved, + "Gateway operator override must take absolute precedence over per-request AvailabilityStrategy"); + } + + /// + /// Companion to : + /// when the Gateway flag is false, the per-request + /// MUST be honored — the override only suppresses hedging while the flag is true. + /// + [TestMethod] + public void RequestInvokerHandler_FlagFalse_HonorsPerRequestStrategy() + { + using CosmosClient mockCosmosClient = MockCosmosUtil.CreateMockCosmosClient(); + mockCosmosClient.DocumentClient.DisableCrossRegionalHedgingForTests = false; + + RequestInvokerHandler handler = new RequestInvokerHandler( + mockCosmosClient, + requestedClientConsistencyLevel: null, + requestedClientReadConsistencyStrategy: null, + requestedClientPriorityLevel: null, + requestedClientThroughputBucket: null); + + CrossRegionHedgingAvailabilityStrategy perRequestStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(500), + thresholdStep: TimeSpan.FromMilliseconds(100)); + + using RequestMessage request = new RequestMessage( + HttpMethod.Get, + new Uri("/dbs/testdb/colls/testcontainer/docs/testId", UriKind.Relative)) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + RequestOptions = new RequestOptions + { + AvailabilityStrategy = perRequestStrategy + } + }; + + AvailabilityStrategyInternal resolved = handler.AvailabilityStrategy(request); + + Assert.IsNotNull(resolved, "Per-request strategy must be honored when Gateway flag is false"); + Assert.AreSame( + perRequestStrategy, + resolved, + "Per-request strategy must be returned verbatim when Gateway flag is false"); + } + private static DocumentClient CreateClient(ConnectionPolicy policy) { return new DocumentClient(new Uri(AccountEndpoint), AccountKey, policy); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs index c61aa1c756..e88a4ea591 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs @@ -542,6 +542,123 @@ void TraceHandler(string message) Assert.IsTrue(isExceptionLogged, "The exception was logged as a warning trace event."); } + /// + /// Regression test for the operator-only kill-switch property-drop scenario. + /// + /// When the Gateway first reports disableCrossRegionalHedging=true and then a later + /// account-refresh response omits the property entirely (HasValue == false), the change + /// event MUST NOT fire — an absent property is "no signal", not an implicit transition to + /// false. Re-enabling hedging silently on a property drop would defeat the entire purpose + /// of the kill-switch during the exact window the operator most wants it disabled + /// (transient gateway response shapes, partial regional failovers, stale gateway versions). + /// + /// The runbook contract is "to disable, set the property to explicit false" — never by + /// removing it. + /// + [TestMethod] + public async Task RefreshDatabaseAccount_DisableCrossRegionalHedging_PropertyDroppedAfterTrue_DoesNotFireChangeEvent() + { + // Arrange — initial AccountProperties with disableCrossRegionalHedging=true. + AccountProperties initialAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + }, + DisableCrossRegionalHedging = true + }; + + // Refresh response drops the property entirely (HasValue == false). Read locations are + // preserved so the refresh path itself completes successfully. + AccountProperties refreshAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + } + // DisableCrossRegionalHedging intentionally NOT set (null). + }; + + Mock mockOwner = new Mock(); + mockOwner.Setup(owner => owner.ServiceEndpoint).Returns(new Uri("https://defaultendpoint.net/")); + mockOwner.Setup(owner => owner.GetDatabaseAccountInternalAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(refreshAccount); + + ConnectionPolicy connectionPolicy = new ConnectionPolicy(); + + using GlobalEndpointManager gem = new GlobalEndpointManager(mockOwner.Object, connectionPolicy); + + // Seed the baseline: lastKnownDisableCrossRegionalHedging = true. + gem.InitializeAccountPropertiesAndStartBackgroundRefresh(initialAccount); + + int changeEventInvocations = 0; + gem.OnEnablePartitionLevelFailoverConfigChanged += (_, _) => Interlocked.Increment(ref changeEventInvocations); + + // Act — force a refresh that observes the property-dropped response. + await gem.RefreshLocationAsync(forceRefresh: true); + + // Assert — change event must NOT fire on a true -> null transition. + Assert.AreEqual( + 0, + changeEventInvocations, + "Dropping the disableCrossRegionalHedging property from the gateway response must not be treated as a transition to false. The kill-switch is only released by an explicit 'false' value."); + } + + /// + /// Positive companion to : + /// when the gateway emits an explicit false after a prior true, the change + /// event MUST fire so the SDK can restore hedging. This validates the .HasValue + /// guard is gating only absent values, not explicit ones. + /// + [TestMethod] + public async Task RefreshDatabaseAccount_DisableCrossRegionalHedging_ExplicitFalseAfterTrue_FiresChangeEvent() + { + AccountProperties initialAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + }, + DisableCrossRegionalHedging = true + }; + + AccountProperties refreshAccount = new AccountProperties + { + ReadLocationsInternal = new Collection + { + new AccountRegion { Name = "Region0", Endpoint = "https://region0.documents.azure.com/" } + }, + DisableCrossRegionalHedging = false + }; + + Mock mockOwner = new Mock(); + mockOwner.Setup(owner => owner.ServiceEndpoint).Returns(new Uri("https://defaultendpoint.net/")); + mockOwner.Setup(owner => owner.GetDatabaseAccountInternalAsync(It.IsAny(), It.IsAny())) + .ReturnsAsync(refreshAccount); + + ConnectionPolicy connectionPolicy = new ConnectionPolicy(); + + using GlobalEndpointManager gem = new GlobalEndpointManager(mockOwner.Object, connectionPolicy); + gem.InitializeAccountPropertiesAndStartBackgroundRefresh(initialAccount); + + int changeEventInvocations = 0; + bool? observedDisableValue = null; + gem.OnEnablePartitionLevelFailoverConfigChanged += (_, disableHedging) => + { + Interlocked.Increment(ref changeEventInvocations); + observedDisableValue = disableHedging; + }; + + await gem.RefreshLocationAsync(forceRefresh: true); + + Assert.AreEqual( + 1, + changeEventInvocations, + "Explicit transition from disableCrossRegionalHedging=true to false must fire the change event exactly once."); + Assert.IsTrue(observedDisableValue.HasValue && observedDisableValue.Value == false, + "Change event must propagate the new explicit false value."); + } + private sealed class GetAccountRequestInjector { public Func ShouldFailRequest { get; set; } diff --git a/openspec/changes/ppaf-dynamic-hedging-control/design.md b/openspec/changes/ppaf-dynamic-hedging-control/design.md index 885541a653..deeedbb412 100644 --- a/openspec/changes/ppaf-dynamic-hedging-control/design.md +++ b/openspec/changes/ppaf-dynamic-hedging-control/design.md @@ -29,7 +29,6 @@ PPAF (Partition-Level Failover) is an account-level feature that, when enabled, - Changing customer-authored hedging strategies or their configuration shape. - Modifying PPAF enablement or onboarding flows. -- Supporting non-PPAF accounts with this flag (for the immediate term). - Exposing the flag to end users or making it configurable from the SDK. ## Decisions @@ -85,4 +84,4 @@ PPAF (Partition-Level Failover) is an account-level feature that, when enabled, - **[Risk] Strategy restoration correctness** — When restoring hedging after the flag is toggled off, the SDK must correctly reconstruct the PPAF default strategy or restore the customer's explicit strategy. → **Mitigation:** Store the original strategy reference before nullification. Unit-test the toggle cycle (enable → disable → re-enable). -- **[Trade-off] Non-PPAF accounts ignored** — The flag is only evaluated for PPAF accounts. A future extension could support non-PPAF accounts, but this adds complexity without current demand. +- **[Trade-off] Customer client-level opt-out bypasses the flag** — Customers who set `CosmosClientOptions.DisablePartitionLevelFailover` (surfaced internally as `DisablePartitionLevelFailoverClientLevelOverride`) opt out of all PPAF-related machinery including this Gateway override. This is consistent with the rest of the PPAF control plane and avoids surprising customers who have explicitly disabled the feature. diff --git a/openspec/changes/ppaf-dynamic-hedging-control/proposal.md b/openspec/changes/ppaf-dynamic-hedging-control/proposal.md index 7a92547c87..f48f1481b7 100644 --- a/openspec/changes/ppaf-dynamic-hedging-control/proposal.md +++ b/openspec/changes/ppaf-dynamic-hedging-control/proposal.md @@ -8,7 +8,7 @@ PPAF-enabled Cosmos DB accounts automatically enable hedging with a 1-second thr - When the flag is `true`, hedging is disabled for the PPAF account regardless of any explicit or implicit hedging configuration. - When the flag is `false` or absent, existing hedging behavior is preserved (explicit customer config honored; PPAF defaults applied if no explicit config). - The SDK evaluates the flag dynamically on every account-properties refresh, enabling on-call toggle without customer code changes. -- Non-PPAF accounts ignore the flag. +- The flag is honored for any account where the customer has not opted out via `CosmosClientOptions.DisablePartitionLevelFailover` (surfaced internally as `DisablePartitionLevelFailoverClientLevelOverride`). ## Capabilities @@ -26,3 +26,10 @@ PPAF-enabled Cosmos DB accounts automatically enable hedging with a 1-second thr - **Gateway / service dependency**: the flag is surfaced by the Cosmos DB Gateway; the SDK consumes it read-only. - **No public API surface changes**: the feature is invisible to end users; no new `CosmosClientOptions` or request-options properties are exposed. - **Testing**: unit tests for precedence rules; integration tests validating dynamic toggle via mocked account-property responses. + +## Cross-SDK Parity + +- **.NET (this PR).** Implements the gateway-driven `disableCrossRegionalHedging` flag in the `Microsoft.Azure.Cosmos` v3 SDK. +- **Java.** As of the date of this PR, `azure-sdk-for-java` does not surface this flag. Java already has the GEM `perPartitionAutomaticFailoverConfigModifier` callback plumbing (`GlobalEndpointManager.java`), so mirroring this knob is additive rather than new infrastructure. A tracking issue should be filed against `Azure/azure-sdk-for-java` so an operator flipping the gateway flag gets consistent behavior across .NET and Java clients on the same account. +- **Python.** Out of scope. Python's `azure-cosmos` SDK has no SDK-default PPAF hedging today (default `availability_strategy=False`), so the per-request override path that motivates this knob does not exist. + diff --git a/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md b/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md index ea102b011e..059f063d7d 100644 --- a/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md +++ b/openspec/changes/ppaf-dynamic-hedging-control/specs/gateway-hedging-override/spec.md @@ -91,20 +91,21 @@ The SDK SHALL evaluate the Gateway flag on each account-properties refresh cycle --- -### Requirement: Non-PPAF accounts ignore the flag -The SDK SHALL NOT evaluate or act on the `disableCrossRegionalHedging` flag for accounts that do not have PPAF enabled. +### Requirement: Customer client-level opt-out bypasses the flag +When a customer has set `CosmosClientOptions.DisablePartitionLevelFailover` (surfaced internally as `DisablePartitionLevelFailoverClientLevelOverride`), the SDK SHALL NOT evaluate or act on the `disableCrossRegionalHedging` flag. The customer's local configuration takes precedence over the Gateway-driven operator override. Outside that customer opt-out the flag is honored for both PPAF and non-PPAF accounts. -#### Scenario: Non-PPAF account with flag set to true -- **WHEN** the account does NOT have PPAF enabled (`EnablePartitionLevelFailover` is `false` or absent) +#### Scenario: Customer opt-out with flag set to true +- **WHEN** the customer has set `CosmosClientOptions.DisablePartitionLevelFailover = true` - **AND** the Gateway flag `disableCrossRegionalHedging` is `true` -- **THEN** the SDK SHALL ignore the flag +- **THEN** the SDK SHALL ignore the Gateway flag - **AND** any explicit customer hedging configuration SHALL continue to function normally -#### Scenario: Non-PPAF account with explicit hedging and flag set to true -- **WHEN** the account does NOT have PPAF enabled -- **AND** the customer has configured an explicit `AvailabilityStrategy` +#### Scenario: Non-PPAF account without opt-out with flag set to true +- **WHEN** the account does NOT have PPAF enabled (`EnablePartitionLevelFailover` is `false` or absent) +- **AND** the customer has NOT set `CosmosClientOptions.DisablePartitionLevelFailover` - **AND** the Gateway flag `disableCrossRegionalHedging` is `true` -- **THEN** the SDK SHALL NOT disable the customer's explicit hedging strategy +- **THEN** the SDK SHALL honor the Gateway flag +- **AND** SHALL disable hedging if the customer had configured an explicit `AvailabilityStrategy` --- diff --git a/openspec/changes/ppaf-dynamic-hedging-control/tasks.md b/openspec/changes/ppaf-dynamic-hedging-control/tasks.md index 60c452b896..d7649ee873 100644 --- a/openspec/changes/ppaf-dynamic-hedging-control/tasks.md +++ b/openspec/changes/ppaf-dynamic-hedging-control/tasks.md @@ -16,7 +16,7 @@ ## 4. Request-Level Enforcement -- [ ] 4.1 Update `RequestInvokerHandler` hedging resolution logic to check the Gateway disable flag before evaluating request-level or client-level `AvailabilityStrategy` overrides, ensuring the flag takes absolute precedence when `true` +- [x] 4.1 Update `RequestInvokerHandler` hedging resolution logic to check the Gateway disable flag before evaluating request-level or client-level `AvailabilityStrategy` overrides, ensuring the flag takes absolute precedence when `true` ## 5. Diagnostics and Tracing @@ -27,8 +27,8 @@ - [ ] 6.1 Test: PPAF account with default hedging — flag `true` disables hedging, flag toggled to `false` re-enables default hedging - [ ] 6.2 Test: PPAF account with explicit customer hedging — flag `true` disables hedging, flag toggled to `false` restores customer strategy -- [ ] 6.3 Test: PPAF account with request-level hedging override — flag `true` prevents request-level strategy execution -- [ ] 6.4 Test: Non-PPAF account — flag `true` does not affect explicit customer hedging +- [x] 6.3 Test: PPAF account with request-level hedging override — flag `true` prevents request-level strategy execution +- [ ] 6.4 Test: Account with `DisablePartitionLevelFailoverClientLevelOverride=true` — flag `true` does not affect hedging (verifies customer client-level opt-out gate) - [ ] 6.5 Test: Flag absent from account properties — existing behavior unchanged - [ ] 6.6 Test: Dynamic toggle cycle — enable → disable → re-enable with correct strategy restoration From 2213b5614cbf25c60e4f588474095d12711cfa6b Mon Sep 17 00:00:00 2001 From: Naluwooza Tripician Date: Mon, 18 May 2026 11:42:28 -0700 Subject: [PATCH 5/6] Routing: Fixes unresolved XML doc cref for RequestInvokerHandler.AvailabilityStrategy The cref RequestInvokerHandler.AvailabilityStrategy in DocumentClient.cs could not be resolved because (1) RequestInvokerHandler lives in the Microsoft.Azure.Cosmos.Handlers namespace and is not imported by DocumentClient, and (2) the simple member name 'AvailabilityStrategy' is ambiguous with the public AvailabilityStrategy class. This produced error CS1574 and broke every downstream build job (21 failing CI checks). Qualify the cref with the Handlers namespace and the (RequestMessage) parameter signature so the doc-comment resolver picks the method overload unambiguously. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Microsoft.Azure.Cosmos/src/DocumentClient.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index 18e0e2e96d..0c9475176e 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -7168,7 +7168,7 @@ private void ApplyHedgingStrategyForCurrentState() /// /// Atomic read of the cached Gateway disableCrossRegionalHedging flag, taken under - /// . Used by + /// . Used by /// to enforce the operator-override precedence over per-request and client-level strategy: /// when the Gateway flag is true, hedging is OFF for every request on this client, /// regardless of where the strategy was configured. From 403d509fa182f0ad7cd4d4cd18307b622066fc0a Mon Sep 17 00:00:00 2001 From: Nalu Tripician Date: Tue, 19 May 2026 11:15:58 -0700 Subject: [PATCH 6/6] Routing: Refactors disableCrossRegionalHedging to volatile for lock-free reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two unresolved review threads from @kundadebdatta on PR #5829: 1. IsHedgingDisabledByGateway was taking a monitor lock on every request through RequestInvokerHandler — contradicting its own "atomic kill-switch" contract and adding uncontended-but-non-zero cost to the SDK hot path (CI benchmarks would not surface this since they run steady-state). 2. InitializePartitionLevelFailoverWithDefaultHedging at line 6963 read disableCrossRegionalHedging without any synchronization, even though it executes after the GEM background-refresh loop is live — a narrow but real window where a refresh-driven write could be missed for lack of an acquire barrier. Both findings collapse to the same fix: - Declare disableCrossRegionalHedging as `private volatile bool` so reads have acquire semantics and writes have release semantics. The hedgingStrategyLock continues to serialize the multi-field mutation sequence (flag + stashed customer strategy + ConnectionPolicy.AvailabilityStrategy) on the refresh / init-stash paths. - Drop the lock from IsHedgingDisabledByGateway; collapse it to an expression-body property returning the volatile field directly. - Drop the lock from the DisableCrossRegionalHedgingForTests getter for consistency; keep it on the setter so test writes happens-before any subsequent reconcile-path mutation observes them. - Rewrite the field-level and hedgingStrategyLock comments to spell out the volatile contract explicitly, so future readers do not reintroduce a lock around the hot read. Adds InitializePartitionLevelFailoverWithDefaultHedging_AfterRefreshDrivenFlagFlip_SkipsApplyingDefaultStrategy to GatewayHedgingOverrideTests, which exercises the exact race scenario the reviewer called out: a refresh-driven UpdatePartitionLevelFailoverConfigWithAccountRefresh(true, true) followed by the init-path call to InitializePartitionLevelFailoverWithDefaultHedging, asserting the SDK default is never installed. Complements the existing synthetic-setter coverage in InitializePartitionLevelFailoverWithDefaultHedging_FlagTrue_SkipsApplyingDefaultStrategy. Validation: Microsoft.Azure.Cosmos.Tests Release run — 2750 passed, 0 new failures. All 13 GatewayHedgingOverrideTests pass (12 prior + 1 new). The single failure on this leg (CosmosClientTests.InvalidKey_ExceptionFullStacktrace) is a pre-existing environment-dependent failure unrelated to this change, verified to reproduce on the pre-edit PR head. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Microsoft.Azure.Cosmos/src/DocumentClient.cs | 58 ++++++++++--------- .../GatewayHedgingOverrideTests.cs | 53 +++++++++++++++++ 2 files changed, 85 insertions(+), 26 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/DocumentClient.cs b/Microsoft.Azure.Cosmos/src/DocumentClient.cs index 0c9475176e..347fc00ed3 100644 --- a/Microsoft.Azure.Cosmos/src/DocumentClient.cs +++ b/Microsoft.Azure.Cosmos/src/DocumentClient.cs @@ -157,9 +157,14 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider // caller that wires them up. If a future change ever invokes them off the GEM-serialized // path, the stash/clear/restore sequence stays atomic instead of regressing silently. // - // Per-request reads of ConnectionPolicy.AvailabilityStrategy from RequestInvokerHandler continue - // to rely on atomic reference assignment; they observe either the pre- or post-transition strategy, - // never a half-applied state, and intentionally do not take this lock. + // Per-request reads from RequestInvokerHandler intentionally do NOT take this lock: + // • disableCrossRegionalHedging is declared volatile (acquire semantics on read, release on + // write), which is sufficient for the per-request "is the gateway kill-switch on?" check. + // • ConnectionPolicy.AvailabilityStrategy is a reference field; reference assignment is atomic + // on the CLR, so a request observes either the pre- or post-transition strategy, never a + // half-applied state. + // Skipping the lock keeps the hot path on RequestInvokerHandler.AvailabilityStrategy(...) + // monitor-free. private readonly object hedgingStrategyLock = new object(); internal bool isThinClientEnabled; @@ -202,9 +207,14 @@ internal partial class DocumentClient : IDisposable, IAuthorizationTokenProvider // Gateway-controlled override that disables all cross-regional hedging for PPAF accounts. // Mirrors AccountProperties.DisableCrossRegionalHedging from the most recent account-properties refresh. - // Always read/written under hedgingStrategyLock from the refresh path; per-request reads of the - // mirrored ConnectionPolicy.AvailabilityStrategy rely on atomic reference assignment. - private bool disableCrossRegionalHedging; + // + // Declared volatile so per-request reads from RequestInvokerHandler (via IsHedgingDisabledByGateway) + // and the init-time read in InitializePartitionLevelFailoverWithDefaultHedging do not need to take + // hedgingStrategyLock — volatile gives acquire semantics on the read and release semantics on the + // write, which is exactly what the "atomic kill-switch" contract requires. The lock continues to + // serialize the multi-field (flag + stashed strategy + ConnectionPolicy.AvailabilityStrategy) + // mutation sequence on the refresh / init-stash paths. + private volatile bool disableCrossRegionalHedging; // When the gateway disable flag is true, the customer's explicit AvailabilityStrategy // (if any) is stashed here so it can be restored verbatim if the flag is later toggled back to false. @@ -7167,22 +7177,20 @@ private void ApplyHedgingStrategyForCurrentState() } /// - /// Atomic read of the cached Gateway disableCrossRegionalHedging flag, taken under - /// . Used by - /// to enforce the operator-override precedence over per-request and client-level strategy: + /// Lock-free atomic read of the cached Gateway disableCrossRegionalHedging flag. Used by + /// on every + /// request to enforce the operator-override precedence over per-request and client-level strategy: /// when the Gateway flag is true, hedging is OFF for every request on this client, /// regardless of where the strategy was configured. /// - internal bool IsHedgingDisabledByGateway - { - get - { - lock (this.hedgingStrategyLock) - { - return this.disableCrossRegionalHedging; - } - } - } + /// + /// Safe to read without because the backing field is declared + /// volatile — the read has acquire semantics, so any write that completed on the refresh + /// path before its lock release is visible here. Taking a monitor on every request would impose + /// uncontended-but-non-zero cost on the SDK hot path (steady-state benchmarks would not surface + /// the regression). + /// + internal bool IsHedgingDisabledByGateway => this.disableCrossRegionalHedging; // Test-only accessors. Visible to the unit-test assembly via [InternalsVisibleTo] in // Microsoft.Azure.Cosmos.csproj. @@ -7205,13 +7213,11 @@ internal bool IsHedgingDisabledByGateway // introduced these accessors should be removed in favor of mocking the account snapshot. internal bool DisableCrossRegionalHedgingForTests { - get - { - lock (this.hedgingStrategyLock) - { - return this.disableCrossRegionalHedging; - } - } + // Read is lock-free because disableCrossRegionalHedging is volatile (see field declaration). + // The setter retains the lock so a test write happens-before any subsequent observation on the + // reconcile path that mutates the companion fields (customerConfiguredAvailabilityStrategy, + // ConnectionPolicy.AvailabilityStrategy) under the same lock. + get => this.disableCrossRegionalHedging; set { lock (this.hedgingStrategyLock) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs index 37f1045843..77f2d1b9e5 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayHedgingOverrideTests.cs @@ -93,6 +93,59 @@ public void InitializePartitionLevelFailoverWithDefaultHedging_FlagFalse_Applies } } + /// + /// Regression test for the init-path race called out on PR #5829: in production, + /// is invoked + /// from DocumentClient.cs line 1108 — i.e. after + /// InitializeGatewayConfigurationReaderAsync subscribes the + /// + /// handler and starts the background account-properties refresh loop. A refresh that fires in + /// that narrow window can flip disableCrossRegionalHedging via + /// + /// before the init thread reaches the default-hedging block. Without acquire/release semantics + /// on the flag field, the init-path read could observe a stale value and silently install the + /// SDK default hedging strategy after the operator just disabled hedging. + /// + /// Companion to : + /// that test pins the flag through the synthetic test setter; this one exercises the actual + /// refresh code path that the race scenario depends on. + /// + [TestMethod] + public void InitializePartitionLevelFailoverWithDefaultHedging_AfterRefreshDrivenFlagFlip_SkipsApplyingDefaultStrategy() + { + DocumentClient client = CreateClient(new ConnectionPolicy { EnablePartitionLevelFailover = true }); + try + { + Assert.IsNull(client.ConnectionPolicy.AvailabilityStrategy, "Pre-condition: no strategy configured"); + Assert.IsFalse(client.DisableCrossRegionalHedgingForTests, "Pre-condition: flag starts at default false"); + + // Simulate a refresh-driven flag flip — this is exactly what GEM's background-refresh + // loop calls when AccountProperties.DisableCrossRegionalHedging transitions, and it is + // the write that the init-path read at DocumentClient.cs:6963 races against. + client.UpdatePartitionLevelFailoverConfigWithAccountRefresh( + latestIsEnabled: true, + latestDisableCrossRegionalHedging: true); + + Assert.IsTrue( + client.DisableCrossRegionalHedgingForTests, + "Refresh callback must have published the flag transition"); + + // Now exercise the init-path read. With volatile semantics on the field, the read here + // has acquire semantics and is guaranteed to observe the published value above — and + // therefore must skip applying the SDK default hedging strategy. + client.InitializePartitionLevelFailoverWithDefaultHedging(); + + Assert.IsNull( + client.ConnectionPolicy.AvailabilityStrategy, + "Init-path read of disableCrossRegionalHedging must observe the post-refresh value " + + "and skip applying the SDK default hedging strategy"); + } + finally + { + client.Dispose(); + } + } + [TestMethod] public void UpdateConfig_FlagToggleOn_StashesCustomerStrategyAndClearsAvailabilityStrategy() {