diff --git a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs index ed0218b49b..50ffa81975 100644 --- a/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs +++ b/Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs @@ -135,6 +135,23 @@ public async Task ShouldRetryAsync( } } + if (exception is OperationCanceledException) + { + DefaultTrace.TraceInformation("ClientRetryPolicy: The operation was cancelled. Not retrying. Retry count = {0}, Endpoint = {1}", + this.failoverRetryCount, + this.locationEndpoint?.ToString() ?? string.Empty); + + if (this.partitionKeyRangeLocationCache.IncrementRequestFailureCounterAndCheckIfPartitionCanFailover( + this.documentServiceRequest)) + { + // In the event of a (ppaf + write operation) or (ppcb + read or multi-master write operation) getting timed + // out due to cancellation token expiration on region A, mark the partition as unavailable assuming that + // the partition has been failed over to region B, when per partition automatic failover is enabled. 
+ this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange( + this.documentServiceRequest); + } + } + return await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken); } diff --git a/Microsoft.Azure.Cosmos/src/Handler/AbstractRetryHandler.cs b/Microsoft.Azure.Cosmos/src/Handler/AbstractRetryHandler.cs index 819064c958..cd8796074c 100644 --- a/Microsoft.Azure.Cosmos/src/Handler/AbstractRetryHandler.cs +++ b/Microsoft.Azure.Cosmos/src/Handler/AbstractRetryHandler.cs @@ -67,11 +67,10 @@ private static async Task ExecuteHttpRequestAsync( { while (true) { - cancellationToken.ThrowIfCancellationRequested(); ShouldRetryResult result; - try { + cancellationToken.ThrowIfCancellationRequested(); ResponseMessage cosmosResponseMessage = await callbackMethod(); if (cosmosResponseMessage.IsSuccessStatusCode) { @@ -94,6 +93,14 @@ private static async Task ExecuteHttpRequestAsync( throw; } } + catch (OperationCanceledException oce) + { + result = await callShouldRetryException(oce, cancellationToken); + if (!result.ShouldRetry) + { + throw; + } + } TimeSpan backoffTime = result.BackoffTime; if (backoffTime != TimeSpan.Zero) diff --git a/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs b/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs index 8798218551..876f402edc 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GlobalPartitionEndpointManagerCore.cs @@ -213,11 +213,23 @@ public override bool IncrementRequestFailureCounterAndCheckIfPartitionCanFailove return false; } - PartitionKeyRangeFailoverInfo partionFailover = this.PartitionKeyRangeToLocationForReadAndWrite.Value.GetOrAdd( - partitionKeyRange, - (_) => new PartitionKeyRangeFailoverInfo( - request.RequestContext.ResolvedCollectionRid, - failedLocation)); + PartitionKeyRangeFailoverInfo partionFailover; + if 
(this.IsRequestEligibleForPerPartitionAutomaticFailover(request)) + { + partionFailover = this.PartitionKeyRangeToLocationForWrite.Value.GetOrAdd( + partitionKeyRange, + (_) => new PartitionKeyRangeFailoverInfo( + request.RequestContext.ResolvedCollectionRid, + failedLocation)); + } + else + { + partionFailover = this.PartitionKeyRangeToLocationForReadAndWrite.Value.GetOrAdd( + partitionKeyRange, + (_) => new PartitionKeyRangeFailoverInfo( + request.RequestContext.ResolvedCollectionRid, + failedLocation)); + } partionFailover.IncrementRequestFailureCounts( isReadOnlyRequest: request.IsReadOnlyRequest, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs index 460874c1d2..ae24e76449 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ClientRetryPolicyTests.cs @@ -243,7 +243,8 @@ public void HttpRequestExceptionHandelingTests( GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection( this.partitionKeyRangeLocationCache, - request.RequestContext.ResolvedPartitionKeyRange); + request.RequestContext.ResolvedPartitionKeyRange, + isReadOnlyOrMultiMasterWriteRequest: false); // Validate that the partition key range failover info is not present before the http request exception was captured in the retry policy. 
Assert.IsNull(partitionKeyRangeFailoverInfo); @@ -255,7 +256,8 @@ public void HttpRequestExceptionHandelingTests( partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection( this.partitionKeyRangeLocationCache, - request.RequestContext.ResolvedPartitionKeyRange); + request.RequestContext.ResolvedPartitionKeyRange, + isReadOnlyOrMultiMasterWriteRequest: false); if (enablePartitionLevelFailover) { @@ -266,6 +268,86 @@ public void HttpRequestExceptionHandelingTests( { Assert.IsNull(partitionKeyRangeFailoverInfo); } + } + + /// <summary> + /// Test to validate that when an OperationCanceledException is passed to the retry policy repeatedly for a single master write account, the request is not retried and, + /// when partition level failover is enabled, the partition key range failover info points at the second read endpoint; when it is disabled, no partition failover info is recorded. + /// </summary> + [TestMethod] + [DataRow(true, true, DisplayName = "Read Request - Case when partition level failover is enabled.")] + [DataRow(false, true, DisplayName = "Write Request - Case when partition level failover is enabled.")] + [DataRow(true, false, DisplayName = "Read Request - Case when partition level failover is disabled.")] + [DataRow(false, false, DisplayName = "Write Request - Case when partition level failover is disabled.")] + public void CosmosOperationCancelledExceptionHandelingTests( + bool isReadOnlyRequest, + bool enablePartitionLevelFailover) + { + int requestThreshold = isReadOnlyRequest ? 
10 : 5; + const bool enableEndpointDiscovery = true; + const string suffix = "-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF"; + + // Creates a sample read or write request, as selected by isReadOnlyRequest + DocumentServiceRequest request = this.CreateRequest(isReadOnlyRequest, false); + request.RequestContext.ResolvedPartitionKeyRange = new PartitionKeyRange() { Id = "0", MinInclusive = "3F" + suffix, MaxExclusive = "5F" + suffix }; + + // Create GlobalEndpointManager + using GlobalEndpointManager endpointManager = this.Initialize( + useMultipleWriteLocations: false, + enableEndpointDiscovery: enableEndpointDiscovery, + isPreferredLocationsListEmpty: false, + enablePartitionLevelFailover: enablePartitionLevelFailover); + + // Capture the read locations. + ReadOnlyCollection readLocations = endpointManager.ReadEndpoints; + + // Create Retry Policy + ClientRetryPolicy retryPolicy = new( + globalEndpointManager: endpointManager, + partitionKeyRangeLocationCache: this.partitionKeyRangeLocationCache, + retryOptions: new RetryOptions(), + enableEndpointDiscovery: enableEndpointDiscovery, + isPartitionLevelFailoverEnabled: enablePartitionLevelFailover); + + CancellationToken cancellationToken = new(); + OperationCanceledException operationCancelledException = new(message: "Operation was cancelled due to cancellation token expiry."); + + GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection( + this.partitionKeyRangeLocationCache, + request.RequestContext.ResolvedPartitionKeyRange, + isReadOnlyOrMultiMasterWriteRequest: isReadOnlyRequest); + + // Validate that the partition key range failover info is not present before the operation canceled exception was captured in the retry policy. + Assert.IsNull(partitionKeyRangeFailoverInfo); + + Task retryStatus; + + // With cancellation token expiry, the retry policy should not failover the offending partition + // until the request threshold used by this test (10 for reads, 5 for writes) is reached. 
+ for (int i=0; i< requestThreshold; i++) + { + retryPolicy.OnBeforeSendRequest(request); + retryStatus = retryPolicy.ShouldRetryAsync(operationCancelledException, cancellationToken); + } + + retryStatus = retryPolicy.ShouldRetryAsync(operationCancelledException, cancellationToken); + Assert.IsFalse(retryStatus.Result.ShouldRetry); + + partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection( + this.partitionKeyRangeLocationCache, + request.RequestContext.ResolvedPartitionKeyRange, + isReadOnlyOrMultiMasterWriteRequest: isReadOnlyRequest); + + if (enablePartitionLevelFailover) + { + // Validate that the partition key range failover info to the next account region is present after the operation canceled exception was captured in the retry policy. + Assert.IsNotNull(partitionKeyRangeFailoverInfo); + Assert.AreEqual(partitionKeyRangeFailoverInfo.Current, readLocations[1]); + } + else + { + Assert.IsNull(partitionKeyRangeFailoverInfo); + } } [TestMethod] @@ -433,12 +515,14 @@ await BackoffRetryUtility.ExecuteAsync( private static GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo GetPartitionKeyRangeFailoverInfoUsingReflection( GlobalPartitionEndpointManager globalPartitionEndpointManager, - PartitionKeyRange pkRange) + PartitionKeyRange pkRange, + bool isReadOnlyOrMultiMasterWriteRequest) { + string fieldName = isReadOnlyOrMultiMasterWriteRequest ? 
"PartitionKeyRangeToLocationForReadAndWrite" : "PartitionKeyRangeToLocationForWrite"; FieldInfo fieldInfo = globalPartitionEndpointManager .GetType() .GetField( - name: "PartitionKeyRangeToLocationForWrite", + name: fieldName, bindingAttr: BindingFlags.Instance | BindingFlags.NonPublic); if (fieldInfo != null) @@ -494,6 +578,7 @@ private GlobalEndpointManager Initialize( bool enforceSingleMasterSingleWriteLocation = false, // Some tests depend on the Initialize to create an account with multiple write locations, even when not multi master ReadOnlyCollection preferedRegionListOverride = null, bool enablePartitionLevelFailover = false, + bool enablePartitionLevelCircuitBreaker = false, bool multimasterMetadataWriteRetryTest = false) { this.databaseAccount = ClientRetryPolicyTests.CreateDatabaseAccount( @@ -545,7 +630,8 @@ private GlobalEndpointManager Initialize( { this.partitionKeyRangeLocationCache = new GlobalPartitionEndpointManagerCore( globalEndpointManager: endpointManager, - isPartitionLevelFailoverEnabled: enablePartitionLevelFailover); + isPartitionLevelFailoverEnabled: enablePartitionLevelFailover, + isPartitionLevelCircuitBreakerEnabled: enablePartitionLevelFailover || enablePartitionLevelCircuitBreaker); } else {