From b9242094bada0cb5b05851ff077f068019b57d7b Mon Sep 17 00:00:00 2001 From: Debdatta Kunda <87335885+kundadebdatta@users.noreply.github.com> Date: Fri, 9 Jun 2023 21:39:37 -0700 Subject: [PATCH] [Internal] Upgrade Resiliency: Adds Replica Health State Diagnostics. (#3835) * Code changes to add replica health status in diagnostics. * Code changes to fix performance test build failure. * Code changes to add health state capture logic in address cache. * Code changes to fix benchmark test execution. * Code changes to add tests to validate health state cache. * Code changes to reduce default request timeout to 5 seconds. * Revert "Code changes to reduce default request timeout to 5 seconds." This reverts commit 139f37e588fc9dfed608431f4186c567a080e622. --- .../src/Routing/GatewayAddressCache.cs | 23 +++++++ .../Tracing/TraceWriter.TraceJsonWriter.cs | 18 ++++++ .../Contracts/BenchmarkResults.json | 6 +- .../TraceWriterBaselineTests.TraceData.xml | 7 ++ .../GatewayAddressCacheTests.cs | 64 +++++++++++++++++++ .../Tracing/TraceTests.cs | 2 - 6 files changed, 115 insertions(+), 5 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs b/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs index 182243ec40..14b641959a 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/GatewayAddressCache.cs @@ -550,10 +550,17 @@ await this.GetServerAddressesViaGatewayAsync(request, collectionRid, new[] { par } this.ValidateReplicaAddresses(transportAddressUris); + this.CaptureTransportAddressUriHealthStates( + partitionAddressInformation: mergedAddresses, + transportAddressUris: transportAddressUris); return mergedAddresses; } + this.CaptureTransportAddressUriHealthStates( + partitionAddressInformation: result.Item2, + transportAddressUris: result.Item2.Get(Protocol.Tcp)?.ReplicaTransportAddressUris); + return result.Item2; } } @@ -936,6 +943,22 @@ TransportAddressHealthState.HealthStatus.Unknown 
or TransportAddressHealthState.HealthStatus.UnhealthyPending); } + /// + /// The replica health status of the transport address uri will change eventually with the monotonically increasing time. + /// However, the purpose of this method is to capture the health status snapshot at this moment. + /// + /// An instance of . + /// A read-only list of . + private void CaptureTransportAddressUriHealthStates( + PartitionAddressInformation partitionAddressInformation, + IReadOnlyList transportAddressUris) + { + partitionAddressInformation + .Get(Protocol.Tcp)? + .SetTransportAddressUrisHealthState( + replicaHealthStates: transportAddressUris.Select(x => x.GetCurrentHealthState().GetHealthStatusDiagnosticString()).ToList()); + } + protected virtual void Dispose(bool disposing) { if (this.disposedValue) diff --git a/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs b/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs index 74363c8ca1..247c9f2587 100644 --- a/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs +++ b/Microsoft.Azure.Cosmos/src/Tracing/TraceWriter.TraceJsonWriter.cs @@ -430,6 +430,8 @@ public void Visit(StoreResult storeResult) this.jsonWriter.WriteFieldName("BELatencyInMs"); this.WriteStringValueOrNull(storeResult.BackendRequestDurationInMs); + this.WriteJsonUriArray("ReplicaHealthStatuses", storeResult.ReplicaHealthStatuses); + this.VisitTransportRequestStats(storeResult.TransportRequestStats); this.jsonWriter.WriteFieldName("TransportException"); @@ -468,6 +470,22 @@ private void WriteJsonUriArray(string propertyName, IEnumerable replicaHealthStatuses) + { + this.jsonWriter.WriteFieldName(propertyName); + this.jsonWriter.WriteArrayStart(); + + if (replicaHealthStatuses != null) + { + foreach (string replicaHealthStatus in replicaHealthStatuses) + { + this.WriteStringValueOrNull(replicaHealthStatus); + } + } + + this.jsonWriter.WriteArrayEnd(); + } + private void WriteRegionsContactedArray(string propertyName, 
IEnumerable<(string, Uri)> uris) { this.jsonWriter.WriteFieldName(propertyName); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json index c97e0b5b2c..a06dabf09c 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Performance.Tests/Contracts/BenchmarkResults.json @@ -14,7 +14,7 @@ "MockedItemBenchmark.DeleteItemNotExists;[Type=OfT]": 42172.5, "MockedItemBenchmark.DeleteItemNotExists;[Type=OfTCustom]": 42174.75, "MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithClientTelemetryEnabled]": 42166.5, - "MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithDiagnosticsToString]": 56695.5, + "MockedItemBenchmark.DeleteItemNotExists;[Type=OfTWithDiagnosticsToString]": 63338, "MockedItemBenchmark.DeleteItemNotExists;[Type=Stream]": 37610, "MockedItemBenchmark.QuerySinglePartitionMultiplePages;[Type=OfT]": 13342232, "MockedItemBenchmark.QuerySinglePartitionMultiplePages;[Type=OfTCustom]": 13341058, @@ -34,12 +34,12 @@ "MockedItemBenchmark.ReadItemExists;[Type=OfT]": 33630.5, "MockedItemBenchmark.ReadItemExists;[Type=OfTCustom]": 33636.25, "MockedItemBenchmark.ReadItemExists;[Type=OfTWithClientTelemetryEnabled]": 33627.75, - "MockedItemBenchmark.ReadItemExists;[Type=OfTWithDiagnosticsToString]": 47961.25, + "MockedItemBenchmark.ReadItemExists;[Type=OfTWithDiagnosticsToString]": 55044, "MockedItemBenchmark.ReadItemExists;[Type=Stream]": 26018.25, "MockedItemBenchmark.ReadItemNotExists;[Type=OfT]": 43489.25, "MockedItemBenchmark.ReadItemNotExists;[Type=OfTCustom]": 43490, "MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithClientTelemetryEnabled]": 43489.25, - "MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithDiagnosticsToString]": 57420.25, + 
"MockedItemBenchmark.ReadItemNotExists;[Type=OfTWithDiagnosticsToString]": 58054, "MockedItemBenchmark.ReadItemNotExists;[Type=Stream]": 39044, "MockedItemBenchmark.UpdateItem;[Type=OfT]": 36591, "MockedItemBenchmark.UpdateItem;[Type=OfTCustom]": 36594.25, diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml index 8b6adc1c15..885699cb43 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/BaselineTest/TestBaseline/TraceWriterBaselineTests.TraceData.xml @@ -340,6 +340,12 @@ "RequestCharge": 3.14, "RetryAfterInMs": "9000", "BELatencyInMs": "4.2", + "ReplicaHealthStatuses": [ + "http://storephysicaladdress-1p.com:Connected", + "http://storephysicaladdress-2s.com:Unknown", + "http://storephysicaladdress-3s.com:Unhealthy", + "http://storephysicaladdress-4s.com:Unknown" + ], "transportRequestTimeline": { "requestTimeline": [ { @@ -529,6 +535,7 @@ "RequestCharge": 0, "RetryAfterInMs": null, "BELatencyInMs": null, + "ReplicaHealthStatuses":[], "transportRequestTimeline": null, "TransportException": null } diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs index 2d0d0a5c6a..5116eb22de 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GatewayAddressCacheTests.cs @@ -971,6 +971,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabled_ShouldValidat expected: TransportAddressHealthState.HealthStatus.Connected, actual: 
refreshedUri.GetCurrentHealthState().GetHealthStatus()); + GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics( + addressInfo: addressInfo, + numberOfConnectedReplicas: 0, + numberOfUnknownReplicas: 4, + numberOfUnhealthyPendingReplicas: 0, + numberOfUnhealthyReplicas: 0); + Assert.AreEqual(4, addressInfo.AllAddresses.Count); Assert.AreEqual(1, addressInfo.AllAddresses.Count(x => x.PhysicalUri == oldAddress)); Assert.AreEqual(0, addressInfo.AllAddresses.Count(x => x.PhysicalUri == newAddress)); @@ -1008,6 +1015,14 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabled_ShouldValidat actual: refreshedUri.GetCurrentHealthState().GetHealthStatus()); mockHttpHandler.VerifyAll(); + + GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics( + addressInfo: addressInfo, + numberOfConnectedReplicas: 2, + numberOfUnknownReplicas: 1, + numberOfUnhealthyPendingReplicas: 1, + numberOfUnhealthyReplicas: 0); + GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes( fakeOpenConnectionHandler: fakeOpenConnectionHandler, expectedTotalFailedAddressesToOpenCount: 0, @@ -1106,6 +1121,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabledAndUnhealthyUr manualResetEvent: manualResetEvent, shouldReset: true); + GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics( + addressInfo: addressInfo, + numberOfConnectedReplicas: 0, + numberOfUnknownReplicas: 4, + numberOfUnhealthyPendingReplicas: 0, + numberOfUnhealthyReplicas: 0); + GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes( fakeOpenConnectionHandler: fakeOpenConnectionHandler, expectedTotalFailedAddressesToOpenCount: 0, @@ -1157,6 +1179,13 @@ public async Task TryGetAddressesAsync_WhenReplicaVlidationEnabledAndUnhealthyUr expected: TransportAddressHealthState.HealthStatus.Unhealthy, actual: refreshedUri.GetCurrentHealthState().GetHealthStatus()); + GatewayAddressCacheTests.ValidateHealthStatesInDiagnostics( + addressInfo: addressInfo, + numberOfConnectedReplicas: 2, + 
numberOfUnknownReplicas: 1, + numberOfUnhealthyPendingReplicas: 1, + numberOfUnhealthyReplicas: 0); + GatewayAddressCacheTests.AssertOpenConnectionHandlerAttributes( fakeOpenConnectionHandler: fakeOpenConnectionHandler, expectedTotalFailedAddressesToOpenCount: 1, @@ -1356,6 +1385,41 @@ private static void AssertOpenConnectionHandlerAttributes( Assert.AreEqual(expectedTotalSuccessAddressesToOpenCount, fakeOpenConnectionHandler.GetTotalSuccessfulInvocationCount()); } + /// + /// Helper method to validate and assert on the cached health states for diagnostics. + /// + /// An instance of containing the partition address information. + /// An integer containing the number of connected replicas to be validated in the cached health status list. + /// An integer containing the number of unknown replicas to be validated in the cached health status list. + /// An integer containing the number of unhealthy pending replicas to be validated in the cached health status list. + /// An integer containing the number of unhealthy replicas to be validated in the cached health status list. 
+ private static void ValidateHealthStatesInDiagnostics( + PartitionAddressInformation addressInfo, + int numberOfConnectedReplicas, + int numberOfUnknownReplicas, + int numberOfUnhealthyPendingReplicas, + int numberOfUnhealthyReplicas) + { + IReadOnlyList replicaHealthStatuses = addressInfo.Get(Protocol.Tcp)?.ReplicaTransportAddressUrisHealthState; + + Assert.IsNotNull(replicaHealthStatuses); + Assert.AreEqual( + expected: numberOfConnectedReplicas, + actual: replicaHealthStatuses.Where(x => x.Contains("| status: Connected |")).Count()); + + Assert.AreEqual( + expected: numberOfUnknownReplicas, + actual: replicaHealthStatuses.Where(x => x.Contains("| status: Unknown |")).Count()); + + Assert.AreEqual( + expected: numberOfUnhealthyPendingReplicas, + actual: replicaHealthStatuses.Where(x => x.Contains("| status: UnhealthyPending |")).Count()); + + Assert.AreEqual( + expected: numberOfUnhealthyReplicas, + actual: replicaHealthStatuses.Where(x => x.Contains("| status: Unhealthy |")).Count()); + } + private class FakeMessageHandler : HttpMessageHandler { private bool returnFullReplicaSet; diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs index ad21070609..7c0cca1f95 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Tracing/TraceTests.cs @@ -125,8 +125,6 @@ public void ValidateStoreResultSerialization() storeResultProperties.Remove(nameof(storeResult.Target.Exception)); storeResultProperties.Add("transportRequestTimeline"); storeResultProperties.Remove(nameof(storeResult.Target.TransportRequestStats)); - storeResultProperties.Add("ReplicaHealthStatuses"); - storeResultProperties.Remove(nameof(storeResult.Target.ReplicaHealthStatuses)); foreach (string key in jsonPropertyNames) {