Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,23 @@ public async Task<ShouldRetryResult> ShouldRetryAsync(
}
}

if (exception is OperationCanceledException)
{
DefaultTrace.TraceInformation("ClientRetryPolicy: The operation was cancelled. Not retrying. Retry count = {0}, Endpoint = {1}",
this.failoverRetryCount,
this.locationEndpoint?.ToString() ?? string.Empty);

if (this.partitionKeyRangeLocationCache.IncrementRequestFailureCounterAndCheckIfPartitionCanFailover(
Comment thread
kundadebdatta marked this conversation as resolved.
this.documentServiceRequest))
{
// In the event of a (ppaf + write operation) or (ppcb + read or multi-master write operation) getting timed
// out due to cancellation token expiration on region A, mark the partition as unavailable assuming that
// the partition has been failed over to region B, when per partition automatic failover is enabled.
this.partitionKeyRangeLocationCache.TryMarkEndpointUnavailableForPartitionKeyRange(
this.documentServiceRequest);
}
}

return await this.throttlingRetry.ShouldRetryAsync(exception, cancellationToken);
}

Expand Down
11 changes: 9 additions & 2 deletions Microsoft.Azure.Cosmos/src/Handler/AbstractRetryHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,10 @@ private static async Task<ResponseMessage> ExecuteHttpRequestAsync(
{
while (true)
{
cancellationToken.ThrowIfCancellationRequested();
ShouldRetryResult result;

try
{
cancellationToken.ThrowIfCancellationRequested();
ResponseMessage cosmosResponseMessage = await callbackMethod();
if (cosmosResponseMessage.IsSuccessStatusCode)
{
Expand All @@ -94,6 +93,14 @@ private static async Task<ResponseMessage> ExecuteHttpRequestAsync(
throw;
}
}
catch (OperationCanceledException oce)
{
result = await callShouldRetryException(oce, cancellationToken);
if (!result.ShouldRetry)
{
throw;
}
}

TimeSpan backoffTime = result.BackoffTime;
if (backoffTime != TimeSpan.Zero)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,23 @@ public override bool IncrementRequestFailureCounterAndCheckIfPartitionCanFailove
return false;
}

PartitionKeyRangeFailoverInfo partionFailover = this.PartitionKeyRangeToLocationForReadAndWrite.Value.GetOrAdd(
partitionKeyRange,
(_) => new PartitionKeyRangeFailoverInfo(
request.RequestContext.ResolvedCollectionRid,
failedLocation));
PartitionKeyRangeFailoverInfo partionFailover;
if (this.IsRequestEligibleForPerPartitionAutomaticFailover(request))
{
partionFailover = this.PartitionKeyRangeToLocationForWrite.Value.GetOrAdd(
partitionKeyRange,
(_) => new PartitionKeyRangeFailoverInfo(
request.RequestContext.ResolvedCollectionRid,
failedLocation));
}
else
{
partionFailover = this.PartitionKeyRangeToLocationForReadAndWrite.Value.GetOrAdd(
partitionKeyRange,
(_) => new PartitionKeyRangeFailoverInfo(
request.RequestContext.ResolvedCollectionRid,
failedLocation));
}

partionFailover.IncrementRequestFailureCounts(
isReadOnlyRequest: request.IsReadOnlyRequest,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,8 @@ public void HttpRequestExceptionHandelingTests(

GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection(
this.partitionKeyRangeLocationCache,
request.RequestContext.ResolvedPartitionKeyRange);
request.RequestContext.ResolvedPartitionKeyRange,
isReadOnlyOrMultiMasterWriteRequest: false);

// Validate that the partition key range failover info is not present before the http request exception was captured in the retry policy.
Assert.IsNull(partitionKeyRangeFailoverInfo);
Expand All @@ -255,7 +256,8 @@ public void HttpRequestExceptionHandelingTests(

partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection(
this.partitionKeyRangeLocationCache,
request.RequestContext.ResolvedPartitionKeyRange);
request.RequestContext.ResolvedPartitionKeyRange,
isReadOnlyOrMultiMasterWriteRequest: false);

if (enablePartitionLevelFailover)
{
Expand All @@ -266,6 +268,86 @@ public void HttpRequestExceptionHandelingTests(
{
Assert.IsNull(partitionKeyRangeFailoverInfo);
}
}

/// <summary>
/// Test to validate that when an OperationCanceledException is thrown during the retry attempt, for a single master write account with PPAF enabled,
/// a partition level failover is applied and the subsequent requests will be retried on the next region for the faulty partition.
/// </summary>
[TestMethod]
[DataRow(true, true, DisplayName = "Read Request - Case when partition level failover is enabled.")]
[DataRow(false, true, DisplayName = "Write Request - Case when partition level failover is enabled.")]
[DataRow(true, false, DisplayName = "Read Request - Case when partition level failover is disabled.")]
[DataRow(false, false, DisplayName = "Write Request - Case when partition level failover is disabled.")]
public void CosmosOperationCancelledExceptionHandelingTests(
bool isReadOnlyRequest,
bool enablePartitionLevelFailover)
{
int requestThreshold = isReadOnlyRequest ? 10 : 5;
const bool enableEndpointDiscovery = true;
const string suffix = "-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF-FF";

//Creates a sample write request
DocumentServiceRequest request = this.CreateRequest(isReadOnlyRequest, false);
request.RequestContext.ResolvedPartitionKeyRange = new PartitionKeyRange() { Id = "0", MinInclusive = "3F" + suffix, MaxExclusive = "5F" + suffix };

//Create GlobalEndpointManager
using GlobalEndpointManager endpointManager = this.Initialize(
useMultipleWriteLocations: false,
enableEndpointDiscovery: enableEndpointDiscovery,
isPreferredLocationsListEmpty: false,
enablePartitionLevelFailover: enablePartitionLevelFailover);

// Capture the read locations.
ReadOnlyCollection<Uri> readLocations = endpointManager.ReadEndpoints;

//Create Retry Policy
ClientRetryPolicy retryPolicy = new(
globalEndpointManager: endpointManager,
partitionKeyRangeLocationCache: this.partitionKeyRangeLocationCache,
retryOptions: new RetryOptions(),
enableEndpointDiscovery: enableEndpointDiscovery,
isPartitionLevelFailoverEnabled: enablePartitionLevelFailover);

CancellationToken cancellationToken = new();
OperationCanceledException operationCancelledException = new(message: "Operation was cancelled due to cancellation token expiry.");

GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection(
this.partitionKeyRangeLocationCache,
request.RequestContext.ResolvedPartitionKeyRange,
isReadOnlyOrMultiMasterWriteRequest: isReadOnlyRequest);

// Validate that the partition key range failover info is not present before the http request exception was captured in the retry policy.
Assert.IsNull(partitionKeyRangeFailoverInfo);

Task<ShouldRetryResult> retryStatus;

// With cancellation token expiry, the retry policy should not failover the offending partition
// until the write threshold is met.
for (int i=0; i< requestThreshold; i++)
{
retryPolicy.OnBeforeSendRequest(request);
retryStatus = retryPolicy.ShouldRetryAsync(operationCancelledException, cancellationToken);
}

retryStatus = retryPolicy.ShouldRetryAsync(operationCancelledException, cancellationToken);
Assert.IsFalse(retryStatus.Result.ShouldRetry);

partitionKeyRangeFailoverInfo = ClientRetryPolicyTests.GetPartitionKeyRangeFailoverInfoUsingReflection(
this.partitionKeyRangeLocationCache,
request.RequestContext.ResolvedPartitionKeyRange,
isReadOnlyOrMultiMasterWriteRequest: isReadOnlyRequest);

if (enablePartitionLevelFailover)
{
// Validate that the partition key range failover info to the next account region is present after the http request exception was captured in the retry policy.
Assert.IsNotNull(partitionKeyRangeFailoverInfo);
Assert.AreEqual(partitionKeyRangeFailoverInfo.Current, readLocations[1]);
}
else
{
Assert.IsNull(partitionKeyRangeFailoverInfo);
}
}

[TestMethod]
Expand Down Expand Up @@ -433,12 +515,14 @@ await BackoffRetryUtility<StoreResponse>.ExecuteAsync(

private static GlobalPartitionEndpointManagerCore.PartitionKeyRangeFailoverInfo GetPartitionKeyRangeFailoverInfoUsingReflection(
GlobalPartitionEndpointManager globalPartitionEndpointManager,
PartitionKeyRange pkRange)
PartitionKeyRange pkRange,
bool isReadOnlyOrMultiMasterWriteRequest)
{
string fieldName = isReadOnlyOrMultiMasterWriteRequest ? "PartitionKeyRangeToLocationForReadAndWrite" : "PartitionKeyRangeToLocationForWrite";
FieldInfo fieldInfo = globalPartitionEndpointManager
.GetType()
.GetField(
name: "PartitionKeyRangeToLocationForWrite",
name: fieldName,
bindingAttr: BindingFlags.Instance | BindingFlags.NonPublic);

if (fieldInfo != null)
Expand Down Expand Up @@ -494,6 +578,7 @@ private GlobalEndpointManager Initialize(
bool enforceSingleMasterSingleWriteLocation = false, // Some tests depend on the Initialize to create an account with multiple write locations, even when not multi master
ReadOnlyCollection<string> preferedRegionListOverride = null,
bool enablePartitionLevelFailover = false,
bool enablePartitionLevelCircuitBreaker = false,
bool multimasterMetadataWriteRetryTest = false)
{
this.databaseAccount = ClientRetryPolicyTests.CreateDatabaseAccount(
Expand Down Expand Up @@ -545,7 +630,8 @@ private GlobalEndpointManager Initialize(
{
this.partitionKeyRangeLocationCache = new GlobalPartitionEndpointManagerCore(
globalEndpointManager: endpointManager,
isPartitionLevelFailoverEnabled: enablePartitionLevelFailover);
isPartitionLevelFailoverEnabled: enablePartitionLevelFailover,
isPartitionLevelCircuitBreakerEnabled: enablePartitionLevelFailover || enablePartitionLevelCircuitBreaker);
}
else
{
Expand Down