Skip to content

Commit

Permalink
[Internal] Upgrade Resiliency: Adds Replica Health State Diagnostics. (
Browse files Browse the repository at this point in the history
…#3835)

* Code changes to add replica health status in diagnostics.

* Code changes to fix performance test build failure.

* Code changes to add health state capture logic in address cache.

* Code changes to fix benchmark test execution.

* Code changes to add tests to validate health state cache.

* Code changes to reduce default request timeout to 5 seconds.

* Revert "Code changes to reduce default request timeout to 5 seconds."

This reverts commit 139f37e588fc9dfed608431f4186c567a080e622.
  • Loading branch information
kundadebdatta authored Jun 10, 2023
1 parent 28e03c8 commit b924209
Show file tree
Hide file tree
Showing 6 changed files with 115 additions and 5 deletions.
23 changes: 23 additions & 0 deletions Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs
Original file line number Diff line number Diff line change
Expand Up @@ -550,10 +550,17 @@ await this.GetServerAddressesViaGatewayAsync(request, collectionRid, new[] { par
}

this.ValidateReplicaAddresses(transportAddressUris);
this.CaptureTransportAddressUriHealthStates(
partitionAddressInformation: mergedAddresses,
transportAddressUris: transportAddressUris);

return mergedAddresses;
}

this.CaptureTransportAddressUriHealthStates(
partitionAddressInformation: result.Item2,
transportAddressUris: result.Item2.Get(Protocol.Tcp)?.ReplicaTransportAddressUris);

return result.Item2;
}
}
Expand Down Expand Up @@ -936,6 +943,22 @@ TransportAddressHealthState.HealthStatus.Unknown or
TransportAddressHealthState.HealthStatus.UnhealthyPending);
}

/// <summary>
/// Stores a point-in-time snapshot of the health status diagnostic string of every supplied
/// transport address uri on the Tcp entry of the partition address information.
/// The live health status keeps changing as time advances monotonically, so the stored values
/// only describe the state at the moment this method runs.
/// </summary>
/// <param name="partitionAddressInformation">An instance of <see cref="PartitionAddressInformation"/>.</param>
/// <param name="transportAddressUris">A read-only list of <see cref="TransportAddressUri"/>.</param>
private void CaptureTransportAddressUriHealthStates(
    PartitionAddressInformation partitionAddressInformation,
    IReadOnlyList<TransportAddressUri> transportAddressUris)
{
    // When there is no Tcp entry nothing is captured and the uri list is never touched.
    if (partitionAddressInformation.Get(Protocol.Tcp) is { } tcpAddressInformation)
    {
        // Materialize the list now so later health transitions do not alter the captured values.
        List<string> healthStateSnapshot = transportAddressUris
            .Select(addressUri => addressUri.GetCurrentHealthState().GetHealthStatusDiagnosticString())
            .ToList();

        tcpAddressInformation.SetTransportAddressUrisHealthState(
            replicaHealthStates: healthStateSnapshot);
    }
}

protected virtual void Dispose(bool disposing)
{
if (this.disposedValue)
Expand Down
18 changes: 18 additions & 0 deletions Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ public void Visit(StoreResult storeResult)
this.jsonWriter.WriteFieldName("BELatencyInMs");
this.WriteStringValueOrNull(storeResult.BackendRequestDurationInMs);

this.WriteJsonUriArray("ReplicaHealthStatuses", storeResult.ReplicaHealthStatuses);

this.VisitTransportRequestStats(storeResult.TransportRequestStats);

this.jsonWriter.WriteFieldName("TransportException");
Expand Down Expand Up @@ -468,6 +470,22 @@ private void WriteJsonUriArray(string propertyName, IEnumerable<TransportAddress
this.jsonWriter.WriteArrayEnd();
}

/// <summary>
/// Writes <paramref name="propertyName"/> followed by a JSON array holding one string entry per
/// replica health status. A null sequence is written as an empty array.
/// </summary>
/// <param name="propertyName">The field name to emit before the array.</param>
/// <param name="replicaHealthStatuses">The replica health status strings to write; may be null.</param>
private void WriteJsonUriArray(string propertyName, IEnumerable<string> replicaHealthStatuses)
{
    this.jsonWriter.WriteFieldName(propertyName);
    this.jsonWriter.WriteArrayStart();

    if (replicaHealthStatuses == null)
    {
        // Nothing to enumerate: close the array right away so the field is still emitted.
        this.jsonWriter.WriteArrayEnd();
        return;
    }

    foreach (string healthStatus in replicaHealthStatuses)
    {
        this.WriteStringValueOrNull(healthStatus);
    }

    this.jsonWriter.WriteArrayEnd();
}

private void WriteRegionsContactedArray(string propertyName, IEnumerable<(string, Uri)> uris)
{
this.jsonWriter.WriteFieldName(propertyName);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfT]": 42172.5,
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfTCustom]": 42174.75,
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithClientTelemetryEnabled]": 42166.5,
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithDiagnosticsToString]": 56695.5,
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithDiagnosticsToString]": 63338,
"MockedItemBenchmark.DeleteItemNotExists;[Type=Stream]": 37610,
"MockedItemBenchmark.QuerySinglePartitionMultiplePages;[Type=OfT]": 13342232,
"MockedItemBenchmark.QuerySinglePartitionMultiplePages;[Type=OfTCustom]": 13341058,
Expand All @@ -34,12 +34,12 @@
"MockedItemBenchmark.ReadItemExists;[Type=OfT]": 33630.5,
"MockedItemBenchmark.ReadItemExists;[Type=OfTCustom]": 33636.25,
"MockedItemBenchmark.ReadItemExists;[Type=OfTWithClientTelemetryEnabled]": 33627.75,
"MockedItemBenchmark.ReadItemExists;[Type=OfTWithDiagnosticsToString]": 47961.25,
"MockedItemBenchmark.ReadItemExists;[Type=OfTWithDiagnosticsToString]": 55044,
"MockedItemBenchmark.ReadItemExists;[Type=Stream]": 26018.25,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfT]": 43489.25,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTCustom]": 43490,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithClientTelemetryEnabled]": 43489.25,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithDiagnosticsToString]": 57420.25,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithDiagnosticsToString]": 58054,
"MockedItemBenchmark.ReadItemNotExists;[Type=Stream]": 39044,
"MockedItemBenchmark.UpdateItem;[Type=OfT]": 36591,
"MockedItemBenchmark.UpdateItem;[Type=OfTCustom]": 36594.25,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,12 @@
"RequestCharge": 3.14,
"RetryAfterInMs": "9000",
"BELatencyInMs": "4.2",
"ReplicaHealthStatuses": [
"http://storephysicaladdress-1p.com:Connected",
"http://storephysicaladdress-2s.com:Unknown",
"http://storephysicaladdress-3s.com:Unhealthy",
"http://storephysicaladdress-4s.com:Unknown"
],
"transportRequestTimeline": {
"requestTimeline": [
{
Expand Down Expand Up @@ -529,6 +535,7 @@
"RequestCharge": 0,
"RetryAfterInMs": null,
"BELatencyInMs": null,
"ReplicaHealthStatuses":[],
"transportRequestTimeline": null,
"TransportException": null
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabled_ShouldValidat
expected: TransportAddressHealthState.HealthStatus.Connected,
actual: refreshedUri.GetCurrentHealthState().GetHealthStatus());

GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
addressInfo: addressInfo,
numberOfConnectedReplicas: 0,
numberOfUnknownReplicas: 4,
numberOfUnhealthyPendingReplicas: 0,
numberOfUnhealthyReplicas: 0);

Assert.AreEqual(4, addressInfo.AllAddresses.Count);
Assert.AreEqual(1, addressInfo.AllAddresses.Count(x => x.PhysicalUri == oldAddress));
Assert.AreEqual(0, addressInfo.AllAddresses.Count(x => x.PhysicalUri == newAddress));
Expand Down Expand Up @@ -1008,6 +1015,14 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabled_ShouldValidat
actual: refreshedUri.GetCurrentHealthState().GetHealthStatus());

mockHttpHandler.VerifyAll();

GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
addressInfo: addressInfo,
numberOfConnectedReplicas: 2,
numberOfUnknownReplicas: 1,
numberOfUnhealthyPendingReplicas: 1,
numberOfUnhealthyReplicas: 0);

GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes(
fakeOpenConnectionHandler: fakeOpenConnectionHandler,
expectedTotalFailedAddressesToOpenCount: 0,
Expand Down Expand Up @@ -1106,6 +1121,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabledAndUnhealthyUr
manualResetEvent: manualResetEvent,
shouldReset: true);

GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
addressInfo: addressInfo,
numberOfConnectedReplicas: 0,
numberOfUnknownReplicas: 4,
numberOfUnhealthyPendingReplicas: 0,
numberOfUnhealthyReplicas: 0);

GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes(
fakeOpenConnectionHandler: fakeOpenConnectionHandler,
expectedTotalFailedAddressesToOpenCount: 0,
Expand Down Expand Up @@ -1157,6 +1179,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabledAndUnhealthyUr
expected: TransportAddressHealthState.HealthStatus.Unhealthy,
actual: refreshedUri.GetCurrentHealthState().GetHealthStatus());

GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
addressInfo: addressInfo,
numberOfConnectedReplicas: 2,
numberOfUnknownReplicas: 1,
numberOfUnhealthyPendingReplicas: 1,
numberOfUnhealthyReplicas: 0);

GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes(
fakeOpenConnectionHandler: fakeOpenConnectionHandler,
expectedTotalFailedAddressesToOpenCount: 1,
Expand Down Expand Up @@ -1356,6 +1385,41 @@ private static void AssertOpenConnectionHandlerAttributes(
Assert.AreEqual(expectedTotalSuccessAddressesToOpenCount, fakeOpenConnectionHandler.GetTotalSuccessfulInvocationCount());
}

/// <summary>
/// Helper method that asserts on the health states cached for diagnostics on the Tcp entry of the
/// partition address information, counting the entries that carry each status marker.
/// </summary>
/// <param name="addressInfo">An instance of <see cref="PartitionAddressInformation"/> containing the partition address information.</param>
/// <param name="numberOfConnectedReplicas">The number of connected replicas expected in the cached health status list.</param>
/// <param name="numberOfUnknownReplicas">The number of unknown replicas expected in the cached health status list.</param>
/// <param name="numberOfUnhealthyPendingReplicas">The number of unhealthy pending replicas expected in the cached health status list.</param>
/// <param name="numberOfUnhealthyReplicas">The number of unhealthy replicas expected in the cached health status list.</param>
private static void ValidateHealthStatesInDiagnostics(
    PartitionAddressInformation addressInfo,
    int numberOfConnectedReplicas,
    int numberOfUnknownReplicas,
    int numberOfUnhealthyPendingReplicas,
    int numberOfUnhealthyReplicas)
{
    IReadOnlyList<string> cachedHealthStates = addressInfo.Get(Protocol.Tcp)?.ReplicaTransportAddressUrisHealthState;

    Assert.IsNotNull(cachedHealthStates);

    // Counts the cached entries containing the given marker. Each marker ends with " |", so the
    // "Unhealthy" marker does not also match "UnhealthyPending" entries.
    int CountEntriesContaining(string statusMarker)
    {
        return cachedHealthStates.Count(healthState => healthState.Contains(statusMarker));
    }

    Assert.AreEqual(
        expected: numberOfConnectedReplicas,
        actual: CountEntriesContaining("| status: Connected |"));

    Assert.AreEqual(
        expected: numberOfUnknownReplicas,
        actual: CountEntriesContaining("| status: Unknown |"));

    Assert.AreEqual(
        expected: numberOfUnhealthyPendingReplicas,
        actual: CountEntriesContaining("| status: UnhealthyPending |"));

    Assert.AreEqual(
        expected: numberOfUnhealthyReplicas,
        actual: CountEntriesContaining("| status: Unhealthy |"));
}

private class FakeMessageHandler : HttpMessageHandler
{
private bool returnFullReplicaSet;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,6 @@ public void ValidateStoreResultSerialization()
storeResultProperties.Remove(nameof(storeResult.Target.Exception));
storeResultProperties.Add("transportRequestTimeline");
storeResultProperties.Remove(nameof(storeResult.Target.TransportRequestStats));
storeResultProperties.Add("ReplicaHealthStatuses");
storeResultProperties.Remove(nameof(storeResult.Target.ReplicaHealthStatuses));

foreach (string key in jsonPropertyNames)
{
Expand Down

0 comments on commit b924209

Please sign in to comment.