Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,11 @@ private void ValidateGatewayConnection()
{
if (serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.TooManyRequests
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.ResponseDelay
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.SendDelay)
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.SendDelay
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.DatabaseAccountNotFound
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.ServiceUnavailable
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.InternalServerError
&& serverErrorResult?.GetServerErrorType() != FaultInjectionServerErrorType.LeaseNotFound)
{
throw new ArgumentException($"{serverErrorResult?.GetServerErrorType()} is not supported for metadata requests.");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,13 @@ public enum FaultInjectionServerErrorType
ServiceUnavailable,

/// <summary>
/// 404:1008 Database account not found from gateway
/// 403:1008 Database account not found from gateway
/// </summary>
DatabaseAccountNotFound,

/// <summary>
/// 410:1022 Lease not Found
/// </summary>
LeaseNotFound,
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ public HttpResponseMessage GetInjectedServerError(DocumentServiceRequest dsr, st

httpResponse.Headers.Add(
WFConstants.BackendHeaders.SubStatus,
((int)SubStatusCodes.RUBudgetExceeded).ToString(CultureInfo.InvariantCulture));
((int)SubStatusCodes.Unknown).ToString(CultureInfo.InvariantCulture));
httpResponse.Headers.Add(WFConstants.BackendHeaders.LocalLSN, lsn);

return httpResponse;
Expand All @@ -470,7 +470,7 @@ public HttpResponseMessage GetInjectedServerError(DocumentServiceRequest dsr, st

httpResponse = new HttpResponseMessage
{
StatusCode = HttpStatusCode.NotFound,
StatusCode = HttpStatusCode.Forbidden,
Content = new FauntInjectionHttpContent(
new MemoryStream(
FaultInjectionResponseEncoding.GetBytes($"Fault Injection Server Error: DatabaseAccountNotFound, rule: {ruleId}"))),
Expand All @@ -488,6 +488,28 @@ public HttpResponseMessage GetInjectedServerError(DocumentServiceRequest dsr, st

return httpResponse;

case FaultInjectionServerErrorType.LeaseNotFound:

httpResponse = new HttpResponseMessage
{
StatusCode = HttpStatusCode.Gone,
Content = new FauntInjectionHttpContent(
new MemoryStream(
FaultInjectionResponseEncoding.GetBytes($"Fault Injection Server Error: LeaseNotFound, rule: {ruleId}"))),
};

foreach (string header in headers.AllKeys())
{
httpResponse.Headers.Add(header, headers.Get(header));
}

httpResponse.Headers.Add(
WFConstants.BackendHeaders.SubStatus,
((int)SubStatusCodes.LeaseNotFound).ToString(CultureInfo.InvariantCulture));
httpResponse.Headers.Add(WFConstants.BackendHeaders.LocalLSN, lsn);

return httpResponse;

default:
throw new ArgumentException($"Server error type {this.serverErrorType} is not supported");
}
Expand Down
15 changes: 12 additions & 3 deletions Microsoft.Azure.Cosmos/src/ClientRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -308,11 +308,12 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
this.documentServiceRequest?.RequestContext?.LocationEndpointToRoute?.ToString() ?? string.Empty,
this.documentServiceRequest?.ResourceAddress ?? string.Empty);

//Retry policy will retry on the next preffered region as the original requert region is not accepting requests
return await this.ShouldRetryOnEndpointFailureAsync(
isReadRequest: this.isReadRequest,
markBothReadAndWriteAsUnavailable: false,
forceRefresh: false,
retryOnPreferredLocations: false);
retryOnPreferredLocations: true);
}

if (statusCode == HttpStatusCode.NotFound
Expand All @@ -328,6 +329,13 @@ private async Task<ShouldRetryResult> ShouldRetryInternalAsync(
isSystemResourceUnavailableForWrite: false);
}

// Recieved 500 status code or lease not found
if ((statusCode == HttpStatusCode.InternalServerError && this.isReadRequest)
|| (statusCode == HttpStatusCode.Gone && subStatusCode == SubStatusCodes.LeaseNotFound))
{
return this.ShouldRetryOnUnavailableEndpointStatusCodes();
}

return null;
}

Expand Down Expand Up @@ -467,14 +475,15 @@ private ShouldRetryResult TryMarkEndpointUnavailableForPkRangeAndRetryOnServiceU

this.TryMarkEndpointUnavailableForPkRange(isSystemResourceUnavailableForWrite);

return this.ShouldRetryOnServiceUnavailable();
return this.ShouldRetryOnUnavailableEndpointStatusCodes();
}

/// <summary>
/// For a ServiceUnavailable (503.0) we could be having a timeout from Direct/TCP locally or a request to Gateway request with a similar response due to an endpoint not yet available.
/// We try and retry the request only if there are other regions available. The retry logic is applicable for single master write accounts as well.
/// Other status codes include InternalServerError (500.0) and LeaseNotFound (410.1022).
/// </summary>
private ShouldRetryResult ShouldRetryOnServiceUnavailable()
private ShouldRetryResult ShouldRetryOnUnavailableEndpointStatusCodes()
{
if (this.serviceUnavailableRetryCount++ >= ClientRetryPolicy.MaxServiceUnavailableRetryCount)
{
Expand Down
96 changes: 74 additions & 22 deletions Microsoft.Azure.Cosmos/src/MetadataRequestThrottleRetryPolicy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
private const int DefaultMaxWaitTimeInSeconds = 60;

/// <summary>
/// A constant integer defining the default maximum retry count on service unavailable.
/// A constant integer defining the default maximum retry count on unavailable endpoint.
/// </summary>
private const int DefaultMaxServiceUnavailableRetryCount = 1;
private const int DefaultMaxUnavailableEndpointRetryCount = 1;

/// <summary>
/// An instance of <see cref="IGlobalEndpointManager"/>.
Expand All @@ -38,9 +38,9 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
private readonly IDocumentClientRetryPolicy throttlingRetryPolicy;

/// <summary>
/// An integer defining the maximum retry count on service unavailable.
/// An integer defining the maximum retry count on unavailable endpoint.
/// </summary>
private readonly int maxServiceUnavailableRetryCount;
private readonly int maxUnavailableEndpointRetryCount;

/// <summary>
/// An instance of <see cref="Uri"/> containing the location endpoint where the partition key
Expand All @@ -49,9 +49,9 @@ internal sealed class MetadataRequestThrottleRetryPolicy : IDocumentClientRetryP
private MetadataRetryContext retryContext;

/// <summary>
/// An integer capturing the current retry count on service unavailable.
/// An integer capturing the current retry count on unavailable endpoint.
/// </summary>
private int serviceUnavailableRetryCount;
private int unavailableEndpointRetryCount;

/// <summary>
/// The constructor to initialize an instance of <see cref="MetadataRequestThrottleRetryPolicy"/>.
Expand All @@ -66,8 +66,8 @@ public MetadataRequestThrottleRetryPolicy(
int maxRetryWaitTimeInSeconds = DefaultMaxWaitTimeInSeconds)
{
this.globalEndpointManager = endpointManager;
this.maxServiceUnavailableRetryCount = Math.Max(
MetadataRequestThrottleRetryPolicy.DefaultMaxServiceUnavailableRetryCount,
this.maxUnavailableEndpointRetryCount = Math.Max(
MetadataRequestThrottleRetryPolicy.DefaultMaxUnavailableEndpointRetryCount,
this.globalEndpointManager.PreferredLocationCount);

this.throttlingRetryPolicy = new ResourceThrottleRetryPolicy(
Expand All @@ -91,11 +91,43 @@ public Task<ShouldRetryResult> ShouldRetryAsync(
Exception exception,
CancellationToken cancellationToken)
{
if (exception is CosmosException cosmosException
&& cosmosException.StatusCode == HttpStatusCode.ServiceUnavailable
&& cosmosException.Headers.SubStatusCode == SubStatusCodes.TransportGenerated503)
if (exception is CosmosException cosmosException)
{
if (this.IncrementRetryIndexOnServiceUnavailableForMetadataRead())
return this.ShouldRetryInternalAsync(
cosmosException.StatusCode,
(SubStatusCodes)cosmosException.SubStatusCode,
exception,
cancellationToken);
}

if (exception is DocumentClientException clientException)
{
return this.ShouldRetryInternalAsync(
clientException.StatusCode,
clientException.GetSubStatus(),
exception, cancellationToken);
}

return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
}

private Task<ShouldRetryResult> ShouldRetryInternalAsync(
HttpStatusCode? statusCode,
SubStatusCodes subStatus,
Exception exception,
CancellationToken cancellationToken)
{
if (statusCode == null)
{
return this.throttlingRetryPolicy.ShouldRetryAsync(exception, cancellationToken);
}

if (statusCode == HttpStatusCode.ServiceUnavailable
|| statusCode == HttpStatusCode.InternalServerError
|| (statusCode == HttpStatusCode.Gone && subStatus == SubStatusCodes.LeaseNotFound)
|| (statusCode == HttpStatusCode.Forbidden && subStatus == SubStatusCodes.DatabaseAccountNotFound))
{
if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
{
return Task.FromResult(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
}
Expand All @@ -114,16 +146,36 @@ public Task<ShouldRetryResult> ShouldRetryAsync(
ResponseMessage cosmosResponseMessage,
CancellationToken cancellationToken)
{
if (cosmosResponseMessage?.StatusCode == HttpStatusCode.ServiceUnavailable
&& cosmosResponseMessage?.Headers?.SubStatusCode == SubStatusCodes.TransportGenerated503)
return this.ShouldRetryInternalAsync(
cosmosResponseMessage.StatusCode,
(SubStatusCodes)Convert.ToInt32(cosmosResponseMessage.Headers[WFConstants.BackendHeaders.SubStatus]),
cosmosResponseMessage,
cancellationToken);
}

private Task<ShouldRetryResult> ShouldRetryInternalAsync(
HttpStatusCode? statusCode,
SubStatusCodes subStatus,
ResponseMessage responseMessage,
CancellationToken cancellationToken)
{
if (statusCode == null)
{
return this.throttlingRetryPolicy.ShouldRetryAsync(responseMessage, cancellationToken);
}

if (statusCode == HttpStatusCode.ServiceUnavailable
|| statusCode == HttpStatusCode.InternalServerError
|| (statusCode == HttpStatusCode.Gone && subStatus == SubStatusCodes.LeaseNotFound)
|| (statusCode == HttpStatusCode.Forbidden && subStatus == SubStatusCodes.DatabaseAccountNotFound))
{
if (this.IncrementRetryIndexOnServiceUnavailableForMetadataRead())
if (this.IncrementRetryIndexOnUnavailableEndpointForMetadataRead())
{
return Task.FromResult(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
}
}

return this.throttlingRetryPolicy.ShouldRetryAsync(cosmosResponseMessage, cancellationToken);
return this.throttlingRetryPolicy.ShouldRetryAsync(responseMessage, cancellationToken);
}

/// <summary>
Expand All @@ -146,23 +198,23 @@ public void OnBeforeSendRequest(DocumentServiceRequest request)
}

/// <summary>
/// Increments the location index when a service unavailable exception ocurrs, for any future read requests.
/// Increments the location index when a unavailable endpoint exception ocurrs, for any future read requests.
/// </summary>
/// <returns>A boolean flag indicating if the operation was successful.</returns>
private bool IncrementRetryIndexOnServiceUnavailableForMetadataRead()
private bool IncrementRetryIndexOnUnavailableEndpointForMetadataRead()
{
if (this.serviceUnavailableRetryCount++ >= this.maxServiceUnavailableRetryCount)
if (this.unavailableEndpointRetryCount++ >= this.maxUnavailableEndpointRetryCount)
{
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Retry count: {0} has exceeded the maximum permitted retry count on service unavailable: {1}.", this.serviceUnavailableRetryCount, this.maxServiceUnavailableRetryCount);
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Retry count: {0} has exceeded the maximum permitted retry count on unavailable endpoint: {1}.", this.unavailableEndpointRetryCount, this.maxUnavailableEndpointRetryCount);
return false;
}

// Retrying on second PreferredLocations.
// RetryCount is used as zero-based index.
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Incrementing the metadata retry location index to: {0}.", this.serviceUnavailableRetryCount);
DefaultTrace.TraceWarning("MetadataRequestThrottleRetryPolicy: Incrementing the metadata retry location index to: {0}.", this.unavailableEndpointRetryCount);
this.retryContext = new MetadataRetryContext()
{
RetryLocationIndex = this.serviceUnavailableRetryCount,
RetryLocationIndex = this.unavailableEndpointRetryCount,
RetryRequestOnPreferredLocations = true,
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Text.Json.Serialization;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Azure.Cosmos.Diagnostics;
using Microsoft.Azure.Cosmos.FaultInjection;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using static Microsoft.Azure.Cosmos.Routing.GlobalPartitionEndpointManagerCore;
Expand Down Expand Up @@ -149,6 +150,85 @@ public async Task ReadMany2UnreachablePartitionsTest()
}
}

[TestMethod]
[TestCategory("MultiRegion")]
[DataRow(FaultInjectionServerErrorType.ServiceUnavailable)]
[DataRow(FaultInjectionServerErrorType.InternalServerError)]
[DataRow(FaultInjectionServerErrorType.DatabaseAccountNotFound)]
[DataRow(FaultInjectionServerErrorType.LeaseNotFound)]
public async Task MetadataEndpointUnavailableCrossRegionalRetryTest(FaultInjectionServerErrorType serverErrorType)
{
FaultInjectionRule collReadBad = new FaultInjectionRuleBuilder(
id: "collread",
condition: new FaultInjectionConditionBuilder()
.WithOperationType(FaultInjectionOperationType.MetadataContainer)
.WithRegion(region1)
.Build(),
result: new FaultInjectionServerErrorResultBuilder(serverErrorType)
.Build())
.Build();

FaultInjectionRule pkRangeBad = new FaultInjectionRuleBuilder(
id: "pkrange",
condition: new FaultInjectionConditionBuilder()
.WithOperationType(FaultInjectionOperationType.MetadataPartitionKeyRange)
.WithRegion(region1)
.Build(),
result: new FaultInjectionServerErrorResultBuilder(serverErrorType)
.Build())
.Build();

collReadBad.Disable();
pkRangeBad.Disable();

FaultInjector faultInjector = new FaultInjector(new List<FaultInjectionRule> { pkRangeBad, collReadBad });

CosmosClientOptions cosmosClientOptions = new CosmosClientOptions()
{
ConsistencyLevel = ConsistencyLevel.Session,
ConnectionMode = ConnectionMode.Direct,
Serializer = this.cosmosSystemTextJsonSerializer,
FaultInjector = faultInjector,
ApplicationPreferredRegions = new List<string> { region1, region2, region3 }
};

using (CosmosClient fiClient = new CosmosClient(
connectionString: this.connectionString,
clientOptions: cosmosClientOptions))
{
Database fidb = fiClient.GetDatabase(MultiRegionSetupHelpers.dbName);
Container fic = fidb.GetContainer(MultiRegionSetupHelpers.containerName);

pkRangeBad.Enable();
collReadBad.Enable();

try
{
FeedIterator<CosmosIntegrationTestObject> frTest = fic.GetItemQueryIterator<CosmosIntegrationTestObject>("SELECT * FROM c");
while (frTest.HasMoreResults)
{
FeedResponse<CosmosIntegrationTestObject> feedres = await frTest.ReadNextAsync();

Assert.AreEqual(HttpStatusCode.OK, feedres.StatusCode);
}
}
catch (CosmosException ex)
{
Assert.Fail(ex.Message);
}
finally
{
//Cross regional retry needs to ocur (could trigger for other metadata call to try on secondary region so rule would not trigger)
Assert.IsTrue(pkRangeBad.GetHitCount() + collReadBad.GetHitCount() >= 1);

pkRangeBad.Disable();
collReadBad.Disable();

fiClient.Dispose();
}
}
}

[TestMethod]
[TestCategory("MultiRegion")]
public async Task AddressRefreshTimeoutTest()
Expand Down
Loading
Loading