From b9242094bada0cb5b05851ff077f068019b57d7b Mon Sep 17 00:00:00 2001
From: Debdatta Kunda <87335885+kundadebdatta@users.noreply.github.com>
Date: Fri, 9 Jun 2023 21:39:37 -0700
Subject: [PATCH] [Internal] Upgrade Resiliency: Adds Replica Health State
Diagnostics. (#3835)
* Code changes to add replica health status in diagnostics.
* Code changes to fix performance test build failure.
* Code changes to add health state capture logic in address cache.
* Code changes to fix benchmark test execution.
* Code changes to add tests to validate health state cache.
* Code changes to reduce default request timeout to 5 seconds.
* Revert "Code changes to reduce default request timeout to 5 seconds."
This reverts commit 139f37e588fc9dfed608431f4186c567a080e622.
---
.../src/Routing/GatewayAddressCache.cs | 23 +++++++
.../Tracing/TraceWriter.TraceJsonWriter.cs | 18 ++++++
.../Contracts/BenchmarkResults.json | 6 +-
.../TraceWriterBaselineTests.TraceData.xml | 7 ++
.../GatewayAddressCacheTests.cs | 64 +++++++++++++++++++
.../Tracing/TraceTests.cs | 2 -
6 files changed, 115 insertions(+), 5 deletions(-)
diff --git a/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs b/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs
index 182243ec40..14b641959a 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs
@@ -550,10 +550,17 @@ await this.GetServerAddressesViaGatewayAsync(request, collectionRid, new[] { par
}
this.ValidateReplicaAddresses(transportAddressUris);
+ this.CaptureTransportAddressUriHealthStates(
+ partitionAddressInformation: mergedAddresses,
+ transportAddressUris: transportAddressUris);
return mergedAddresses;
}
+ this.CaptureTransportAddressUriHealthStates(
+ partitionAddressInformation: result.Item2,
+ transportAddressUris: result.Item2.Get(Protocol.Tcp)?.ReplicaTransportAddressUris);
+
return result.Item2;
}
}
@@ -936,6 +943,22 @@ TransportAddressHealthState.HealthStatus.Unknown or
TransportAddressHealthState.HealthStatus.UnhealthyPending);
}
+ /// <summary>
+ /// The replica health status of the transport address uri will change eventually with the monotonically increasing time.
+ /// However, the purpose of this method is to capture the health status snapshot at this moment.
+ /// </summary>
+ /// <param name="partitionAddressInformation">An instance of <see cref="PartitionAddressInformation"/> whose Tcp entry, when present, receives the captured health states.</param>
+ /// <param name="transportAddressUris">A read-only list of transport address uris whose current health status diagnostic strings are captured.</param>
+ private void CaptureTransportAddressUriHealthStates(
+ PartitionAddressInformation partitionAddressInformation,
+ IReadOnlyList transportAddressUris)
+ {
+ partitionAddressInformation
+ .Get(Protocol.Tcp)?
+ .SetTransportAddressUrisHealthState(
+ replicaHealthStates: transportAddressUris.Select(x => x.GetCurrentHealthState().GetHealthStatusDiagnosticString()).ToList());
+ }
+
protected virtual void Dispose(bool disposing)
{
if (this.disposedValue)
diff --git a/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs b/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs
index 74363c8ca1..247c9f2587 100644
--- a/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs
+++ b/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs
@@ -430,6 +430,8 @@ public void Visit(StoreResult storeResult)
this.jsonWriter.WriteFieldName("BELatencyInMs");
this.WriteStringValueOrNull(storeResult.BackendRequestDurationInMs);
+ this.WriteJsonUriArray("ReplicaHealthStatuses", storeResult.ReplicaHealthStatuses);
+
this.VisitTransportRequestStats(storeResult.TransportRequestStats);
this.jsonWriter.WriteFieldName("TransportException");
@@ -468,6 +470,22 @@ private void WriteJsonUriArray(string propertyName, IEnumerable replicaHealthStatuses)
+ {
+ this.jsonWriter.WriteFieldName(propertyName);
+ this.jsonWriter.WriteArrayStart();
+
+ if (replicaHealthStatuses != null)
+ {
+ foreach (string replicaHealthStatus in replicaHealthStatuses)
+ {
+ this.WriteStringValueOrNull(replicaHealthStatus);
+ }
+ }
+
+ this.jsonWriter.WriteArrayEnd();
+ }
+
private void WriteRegionsContactedArray(string propertyName, IEnumerable<(string, Uri)> uris)
{
this.jsonWriter.WriteFieldName(propertyName);
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json
index c97e0b5b2c..a06dabf09c 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json
@@ -14,7 +14,7 @@
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfT]": 42172.5,
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfTCustom]": 42174.75,
"MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithClientTelemetryEnabled]": 42166.5,
- "MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithDiagnosticsToString]": 56695.5,
+ "MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithDiagnosticsToString]": 63338,
"MockedItemBenchmark.DeleteItemNotExists;[Type=Stream]": 37610,
"MockedItemBenchmark.QuerySinglePartitionMultiplePages;[Type=OfT]": 13342232,
"MockedItemBenchmark.QuerySinglePartitionMultiplePages;[Type=OfTCustom]": 13341058,
@@ -34,12 +34,12 @@
"MockedItemBenchmark.ReadItemExists;[Type=OfT]": 33630.5,
"MockedItemBenchmark.ReadItemExists;[Type=OfTCustom]": 33636.25,
"MockedItemBenchmark.ReadItemExists;[Type=OfTWithClientTelemetryEnabled]": 33627.75,
- "MockedItemBenchmark.ReadItemExists;[Type=OfTWithDiagnosticsToString]": 47961.25,
+ "MockedItemBenchmark.ReadItemExists;[Type=OfTWithDiagnosticsToString]": 55044,
"MockedItemBenchmark.ReadItemExists;[Type=Stream]": 26018.25,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfT]": 43489.25,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTCustom]": 43490,
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithClientTelemetryEnabled]": 43489.25,
- "MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithDiagnosticsToString]": 57420.25,
+ "MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithDiagnosticsToString]": 58054,
"MockedItemBenchmark.ReadItemNotExists;[Type=Stream]": 39044,
"MockedItemBenchmark.UpdateItem;[Type=OfT]": 36591,
"MockedItemBenchmark.UpdateItem;[Type=OfTCustom]": 36594.25,
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml
index 8b6adc1c15..885699cb43 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml
@@ -340,6 +340,12 @@
"RequestCharge": 3.14,
"RetryAfterInMs": "9000",
"BELatencyInMs": "4.2",
+ "ReplicaHealthStatuses": [
+ "http://storephysicaladdress-1p.com:Connected",
+ "http://storephysicaladdress-2s.com:Unknown",
+ "http://storephysicaladdress-3s.com:Unhealthy",
+ "http://storephysicaladdress-4s.com:Unknown"
+ ],
"transportRequestTimeline": {
"requestTimeline": [
{
@@ -529,6 +535,7 @@
"RequestCharge": 0,
"RetryAfterInMs": null,
"BELatencyInMs": null,
+ "ReplicaHealthStatuses":[],
"transportRequestTimeline": null,
"TransportException": null
}
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs
index 2d0d0a5c6a..5116eb22de 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs
@@ -971,6 +971,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabled_ShouldValidat
expected: TransportAddressHealthState.HealthStatus.Connected,
actual: refreshedUri.GetCurrentHealthState().GetHealthStatus());
+ GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
+ addressInfo: addressInfo,
+ numberOfConnectedReplicas: 0,
+ numberOfUnknownReplicas: 4,
+ numberOfUnhealthyPendingReplicas: 0,
+ numberOfUnhealthyReplicas: 0);
+
Assert.AreEqual(4, addressInfo.AllAddresses.Count);
Assert.AreEqual(1, addressInfo.AllAddresses.Count(x => x.PhysicalUri == oldAddress));
Assert.AreEqual(0, addressInfo.AllAddresses.Count(x => x.PhysicalUri == newAddress));
@@ -1008,6 +1015,14 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabled_ShouldValidat
actual: refreshedUri.GetCurrentHealthState().GetHealthStatus());
mockHttpHandler.VerifyAll();
+
+ GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
+ addressInfo: addressInfo,
+ numberOfConnectedReplicas: 2,
+ numberOfUnknownReplicas: 1,
+ numberOfUnhealthyPendingReplicas: 1,
+ numberOfUnhealthyReplicas: 0);
+
GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes(
fakeOpenConnectionHandler: fakeOpenConnectionHandler,
expectedTotalFailedAddressesToOpenCount: 0,
@@ -1106,6 +1121,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabledAndUnhealthyUr
manualResetEvent: manualResetEvent,
shouldReset: true);
+ GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
+ addressInfo: addressInfo,
+ numberOfConnectedReplicas: 0,
+ numberOfUnknownReplicas: 4,
+ numberOfUnhealthyPendingReplicas: 0,
+ numberOfUnhealthyReplicas: 0);
+
GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes(
fakeOpenConnectionHandler: fakeOpenConnectionHandler,
expectedTotalFailedAddressesToOpenCount: 0,
@@ -1157,6 +1179,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabledAndUnhealthyUr
expected: TransportAddressHealthState.HealthStatus.Unhealthy,
actual: refreshedUri.GetCurrentHealthState().GetHealthStatus());
+ GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics(
+ addressInfo: addressInfo,
+ numberOfConnectedReplicas: 2,
+ numberOfUnknownReplicas: 1,
+ numberOfUnhealthyPendingReplicas: 1,
+ numberOfUnhealthyReplicas: 0);
+
GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes(
fakeOpenConnectionHandler: fakeOpenConnectionHandler,
expectedTotalFailedAddressesToOpenCount: 1,
@@ -1356,6 +1385,41 @@ private static void AssertOpenConnectionHandlerAttributes(
Assert.AreEqual(expectedTotalSuccessAddressesToOpenCount, fakeOpenConnectionHandler.GetTotalSuccessfulInvocationCount());
}
+ /// <summary>
+ /// Helper method to validate and assert on the cached health states for diagnostics.
+ /// </summary>
+ /// <param name="addressInfo">An instance of <see cref="PartitionAddressInformation"/> containing the partition address information.</param>
+ /// <param name="numberOfConnectedReplicas">An integer containing the number of connected replicas to be validated in the cached health status list.</param>
+ /// <param name="numberOfUnknownReplicas">An integer containing the number of unknown replicas to be validated in the cached health status list.</param>
+ /// <param name="numberOfUnhealthyPendingReplicas">An integer containing the number of unhealthy pending replicas to be validated in the cached health status list.</param>
+ /// <param name="numberOfUnhealthyReplicas">An integer containing the number of unhealthy replicas to be validated in the cached health status list.</param>
+ private static void ValidateHealthStatesInDiagnostics(
+ PartitionAddressInformation addressInfo,
+ int numberOfConnectedReplicas,
+ int numberOfUnknownReplicas,
+ int numberOfUnhealthyPendingReplicas,
+ int numberOfUnhealthyReplicas)
+ {
+ IReadOnlyList replicaHealthStatuses = addressInfo.Get(Protocol.Tcp)?.ReplicaTransportAddressUrisHealthState;
+
+ Assert.IsNotNull(replicaHealthStatuses);
+ Assert.AreEqual(
+ expected: numberOfConnectedReplicas,
+ actual: replicaHealthStatuses.Where(x => x.Contains("| status: Connected |")).Count());
+
+ Assert.AreEqual(
+ expected: numberOfUnknownReplicas,
+ actual: replicaHealthStatuses.Where(x => x.Contains("| status: Unknown |")).Count());
+
+ Assert.AreEqual(
+ expected: numberOfUnhealthyPendingReplicas,
+ actual: replicaHealthStatuses.Where(x => x.Contains("| status: UnhealthyPending |")).Count());
+
+ Assert.AreEqual(
+ expected: numberOfUnhealthyReplicas,
+ actual: replicaHealthStatuses.Where(x => x.Contains("| status: Unhealthy |")).Count());
+ }
+
private class FakeMessageHandler : HttpMessageHandler
{
private bool returnFullReplicaSet;
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs
index ad21070609..7c0cca1f95 100644
--- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs
@@ -125,8 +125,6 @@ public void ValidateStoreResultSerialization()
storeResultProperties.Remove(nameof(storeResult.Target.Exception));
storeResultProperties.Add("transportRequestTimeline");
storeResultProperties.Remove(nameof(storeResult.Target.TransportRequestStats));
- storeResultProperties.Add("ReplicaHealthStatuses");
- storeResultProperties.Remove(nameof(storeResult.Target.ReplicaHealthStatuses));
foreach (string key in jsonPropertyNames)
{