From ec83171e2bfe90c5dc4de3386c66810dca3684b6 Mon Sep 17 00:00:00 2001 From: Fabian Meiswinkel Date: Wed, 11 Feb 2026 11:24:04 +0000 Subject: [PATCH 1/5] Update CrossRegionHedgingAvailabilityStrategy.cs --- .../CrossRegionHedgingAvailabilityStrategy.cs | 56 +++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs index d315b330c8..c6b005c6c0 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs @@ -125,23 +125,24 @@ internal bool ShouldHedge(RequestMessage request, CosmosClient client) /// /// /// - /// + /// /// The response after executing cross region hedging internal override async Task ExecuteAvailabilityStrategyAsync( Func> sender, CosmosClient client, RequestMessage request, - CancellationToken cancellationToken) + CancellationToken applicationProvidedCancellationToken) { if (!this.ShouldHedge(request, client) || client.DocumentClient.GlobalEndpointManager.ReadEndpoints.Count == 1) { - return await sender(request, cancellationToken); + return await sender(request, applicationProvidedCancellationToken); } ITrace trace = request.Trace; - using (CancellationTokenSource cancellationTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken)) + using (CancellationTokenSource hedgeRequestsCancellationTokenSource = + CancellationTokenSource.CreateLinkedTokenSource(applicationProvidedCancellationToken)) { using (CloneableStream clonedBody = (CloneableStream)(request.Content == null ? null @@ -161,7 +162,7 @@ internal override async Task ExecuteAvailabilityStrategyAsync( { TimeSpan awaitTime = requestNumber == 0 ? this.Threshold : this.ThresholdStep; - using (CancellationTokenSource timerTokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken)) + using (CancellationTokenSource timerTokenSource = CancellationTokenSource.CreateLinkedTokenSource(applicationProvidedCancellationToken)) { CancellationToken timerToken = timerTokenSource.Token; using (Task hedgeTimer = Task.Delay(awaitTime, timerToken)) @@ -173,22 +174,32 @@ internal override async Task ExecuteAvailabilityStrategyAsync( hedgeRegions: hedgeRegions, requestNumber: requestNumber, trace: trace, - cancellationToken: cancellationToken, - cancellationTokenSource: cancellationTokenSource); + hedgeRequestsCancellationTokenSource: hedgeRequestsCancellationTokenSource); requestTasks.Add(requestTask); requestTasks.Add(hedgeTimer); - Task completedTask = await Task.WhenAny(requestTasks); - requestTasks.Remove(completedTask); + Task completedTask; + do + { + completedTask = await Task.WhenAny(requestTasks); + requestTasks.Remove(completedTask); + } + while ( + completedTask == hedgeTimer && + // Ignore hedge timer signals if either the e2e timeout is hit + // or the hedgeTimer task failed (or more commonly since this is a linked CTS was cancelled) + // in both of these cases we do not want to spawn new hedge requests + // but just consolidate the outcome of previous requests + (!completedTask.IsCompleted || applicationProvidedCancellationToken.IsCancellationRequested)); if (completedTask == hedgeTimer) { continue; } - timerTokenSource.Cancel(); requestTasks.Remove(hedgeTimer); + timerTokenSource.Cancel(); if (completedTask.IsFaulted) { @@ -198,7 +209,7 @@ internal override async Task ExecuteAvailabilityStrategyAsync( hedgeResponse = await (Task)completedTask; if (hedgeResponse.IsNonTransient) { - cancellationTokenSource.Cancel(); + hedgeRequestsCancellationTokenSource.Cancel(); ((CosmosTraceDiagnostics)hedgeResponse.ResponseMessage.Diagnostics).Value.AddOrUpdateDatum( HedgeConfig, @@ -232,7 +243,7 @@ internal override async Task ExecuteAvailabilityStrategyAsync( hedgeResponse = await (Task)completedTask; if (hedgeResponse.IsNonTransient || requestTasks.Count == 0) { - cancellationTokenSource.Cancel(); + hedgeRequestsCancellationTokenSource.Cancel(); ((CosmosTraceDiagnostics)hedgeResponse.ResponseMessage.Diagnostics).Value.AddOrUpdateDatum( HedgeConfig, this.HedgeConfigText); @@ -264,8 +275,7 @@ private async Task CloneAndSendAsync( IReadOnlyCollection hedgeRegions, int requestNumber, ITrace trace, - CancellationToken cancellationToken, - CancellationTokenSource cancellationTokenSource) + CancellationTokenSource hedgeRequestsCancellationTokenSource) { RequestMessage clonedRequest; @@ -287,8 +297,7 @@ private async Task CloneAndSendAsync( sender, clonedRequest, hedgeRegions.ElementAt(requestNumber), - cancellationToken, - cancellationTokenSource, + hedgeRequestsCancellationTokenSource, trace); } } @@ -297,18 +306,19 @@ private async Task RequestSenderAndResultCheckAsync( Func> sender, RequestMessage request, string targetRegionName, - CancellationToken cancellationToken, - CancellationTokenSource cancellationTokenSource, + CancellationTokenSource hedgeRequestsCancellationTokenSource, ITrace trace) { try { - ResponseMessage response = await sender.Invoke(request, cancellationToken); + ResponseMessage response = await sender.Invoke(request, hedgeRequestsCancellationTokenSource.Token); if (IsFinalResult((int)response.StatusCode, (int)response.Headers.SubStatusCode)) { - if (!cancellationToken.IsCancellationRequested) + if (!hedgeRequestsCancellationTokenSource.IsCancellationRequested) { - cancellationTokenSource.Cancel(); + // App has not reached e2e timeout - we can cancel any still remaining + // hedge requests since we have a final response now + hedgeRequestsCancellationTokenSource.Cancel(); } return new HedgingResponse(true, response, targetRegionName); @@ -316,8 +326,10 @@ private async Task RequestSenderAndResultCheckAsync( return new HedgingResponse(false, response, targetRegionName); } - catch (OperationCanceledException oce) when (cancellationTokenSource.IsCancellationRequested) + catch (OperationCanceledException oce) when (hedgeRequestsCancellationTokenSource.IsCancellationRequested) { + // hedgeRequestsCancellationTokenSource is a linked cancellation token source - so, would also signal + // cancellation on e2e timeout via app provided CT throw new CosmosOperationCanceledException(oce, trace); } catch (Exception ex) From 09073a52938ba188640f3eb590d4f467610ef67d Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:34:54 -0800 Subject: [PATCH 2/5] Add comprehensive unit tests for CrossRegionHedgingAvailabilityStrategy NullRef fix Adds 8 regression tests covering: - Hedge CTS token cancels in-flight requests (not app CT) - Sender receives hedge CTS token, not application token - App cancellation prevents spawning new hedge requests (do/while fix) - Request not accessed after disposal on cancellation - Stream-based request path (ReadItemStreamAsync) - Primary request fast return skips hedging - All-transient error handling - Concurrent hedging stress test (50 parallel requests) --- .../AvailabilityStrategyUnitTests.cs | 507 ++++++++++++++++++ 1 file changed, 507 insertions(+) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs index af411016ca..4bb59de225 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs @@ -4,6 +4,7 @@ using System.Collections.Generic; using System.Collections.ObjectModel; using System.IO; + using System.Net; using System.Net.Http; using System.Threading; using System.Threading.Tasks; @@ -17,6 +18,46 @@ [TestClass] public class AvailabilityStrategyUnitTests { + /// + /// Helper to create a mock CosmosClient with multiple read regions configured. + /// + private static CosmosClient CreateMockClientWithRegions(int regionCount = 2) + { + Collection regions = new Collection(); + for (int i = 0; i < regionCount; i++) + { + regions.Add(new AccountRegion() + { + Name = $"Region{i}", + Endpoint = new Uri($"https://location{i}.documents.azure.com").ToString() + }); + } + + AccountProperties databaseAccount = new AccountProperties() + { + ReadLocationsInternal = regions + }; + + CosmosClient mockCosmosClient = MockCosmosUtil.CreateMockCosmosClient(); + mockCosmosClient.DocumentClient.GlobalEndpointManager + .InitializeAccountPropertiesAndStartBackgroundRefresh(databaseAccount); + + return mockCosmosClient; + } + + /// + /// Helper to create a basic read request for document operations. + /// + private static RequestMessage CreateReadRequest() + { + return new RequestMessage( + HttpMethod.Get, + new Uri("/dbs/testdb/colls/testcontainer/docs/testId", UriKind.Relative)) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read + }; + } [TestMethod] public async Task RequestMessageCloneTests() { @@ -95,6 +136,472 @@ public async Task CancellationTokenThrowsExceptionTest() availabilityStrategy.ExecuteAvailabilityStrategyAsync(sender, mockCosmosClient, request, cts.Token)); } + /// + /// Regression test for NullReferenceException in CrossRegionHedgingAvailabilityStrategy. + /// + /// In the old code, the sender was invoked with the application-provided CancellationToken + /// instead of the hedgeRequestsCancellationTokenSource.Token. When one hedge request completed + /// with a final result and cancelled the hedgeRequestsCancellationTokenSource, the other in-flight + /// hedge requests were NOT cancelled because they held a reference to the original app CT. + /// The CloneAndSendAsync method's using block would dispose the cloned request, but the sender + /// still had a reference to the now-disposed request — causing ArgumentNullException: + /// "Value cannot be null. (Parameter 'request')". + /// + /// The fix passes hedgeRequestsCancellationTokenSource.Token to sender.Invoke() so that all + /// in-flight hedge requests are cancelled when any hedge gets a final result. + /// + [TestMethod] + public async Task HedgeCancellationCancelsInFlightRequests_NoNullRef() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + int senderCallCount = 0; + bool firstRequestCancellationTokenWasCancelled = false; + + // The first request (Region0) will be slow and should be cancelled when Region1 returns. + // The second request (Region1) will return a final result quickly. + Func> sender = async (req, ct) => + { + int callNumber = Interlocked.Increment(ref senderCallCount); + + if (callNumber == 1) + { + // First request: simulate a slow request that respects cancellation. + // In the old code, this CT was the app CT and would NOT be cancelled + // when the hedge CTS was cancelled, leading to NullRef after request disposal. + TaskCompletionSource cancelledTcs = new TaskCompletionSource(); + using (ct.Register(() => + { + firstRequestCancellationTokenWasCancelled = true; + cancelledTcs.TrySetResult(true); + })) + { + await cancelledTcs.Task; + } + + // Return transient response to avoid exception propagation through the strategy + return new ResponseMessage(HttpStatusCode.ServiceUnavailable); + } + else + { + // Second request: return a final result immediately + return new ResponseMessage(HttpStatusCode.OK); + } + }; + + // Act + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, CancellationToken.None); + + // Assert - we got a successful response without NullReferenceException + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + + // The slow request should have been cancelled via the hedge CTS. + // This is the key assertion: with the fix, the sender receives hedgeRequestsCancellationTokenSource.Token. + // When the second hedge returns 200 OK, the CTS is cancelled, which cancels the first request's token. + // In the old code, the first request had the app CT (CancellationToken.None) which was never cancelled. + Assert.IsTrue(firstRequestCancellationTokenWasCancelled, + "The slow first request's cancellation token should have been cancelled when the second hedge " + + "returned a final result. This verifies hedgeRequestsCancellationTokenSource.Token is passed to sender."); + } + + /// + /// Regression test: Verifies that when a non-transient (final) response is received from one + /// hedge region, the cancellation token passed to other in-flight sender calls gets cancelled. + /// + /// In the old (buggy) code, the sender received the application's CancellationToken directly. + /// When hedgeRequestsCancellationTokenSource.Cancel() was called after a final result, + /// the app CT was NOT cancelled, so in-flight senders continued executing on disposed requests. + /// + [TestMethod] + public async Task SenderReceivesHedgeCancellationToken_NotAppToken() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + List capturedTokens = new List(); + + Func> sender = async (req, ct) => + { + lock (capturedTokens) + { + capturedTokens.Add(ct); + } + + // First call: delay enough for the timer to fire and second hedge to be sent + if (capturedTokens.Count == 1) + { + await Task.Delay(TimeSpan.FromSeconds(5), ct).ContinueWith(_ => { }); + } + + return new ResponseMessage(HttpStatusCode.OK); + }; + + // Act + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, CancellationToken.None); + + // Assert + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + Assert.IsTrue(capturedTokens.Count >= 2, + $"Expected at least 2 sender calls (primary + hedge), got {capturedTokens.Count}"); + + // All tokens should be from the same linked CTS (hedgeRequestsCancellationTokenSource), + // NOT the application-provided CancellationToken.None. + // After the fix, when cancellation happens, all captured tokens should signal. + // The key assertion: after the response returns, the hedge CTS is cancelled, + // so all captured tokens should be in a cancelled state. + foreach (CancellationToken ct in capturedTokens) + { + Assert.IsTrue(ct.IsCancellationRequested, + "All sender tokens should be cancelled after a final response is received. " + + "This proves the sender gets the hedge CTS token, not the app token."); + } + } + + /// + /// Regression test: When the application-provided CancellationToken is cancelled (e.g., e2e timeout), + /// the strategy should not attempt to spawn new hedge requests. The fix adds a do/while loop + /// that checks applicationProvidedCancellationToken.IsCancellationRequested when the hedgeTimer + /// completes, preventing new requests from being cloned on an already-cancelled token. + /// + [TestMethod] + public async Task AppCancellationDuringHedging_DoesNotSpawnNewHedgeRequests() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + CancellationTokenSource appCts = new CancellationTokenSource(); + int senderCallCount = 0; + + Func> sender = async (req, ct) => + { + int callNumber = Interlocked.Increment(ref senderCallCount); + + if (callNumber == 1) + { + // First request: cancel the app token after a brief delay + // This simulates an e2e timeout scenario + _ = Task.Delay(15).ContinueWith(_ => appCts.Cancel()); + + // Then wait - this will be cancelled + try + { + await Task.Delay(TimeSpan.FromSeconds(30), ct); + } + catch (OperationCanceledException) + { + throw; + } + } + + return new ResponseMessage(HttpStatusCode.OK); + }; + + // Act & Assert - should throw CosmosOperationCanceledException due to app cancellation + await Assert.ThrowsExceptionAsync( + () => availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, appCts.Token)); + + // With the fix's do/while loop, when the app CT is cancelled, the timer fires + // but the loop detects applicationProvidedCancellationToken.IsCancellationRequested + // and does NOT spawn new hedge requests. Without the fix, additional clones + // would be attempted on a cancelled token path, potentially causing NullRef. + } + + /// + /// Regression test: Simulates the exact scenario from the NullRef crash reports. + /// Multiple regions, the sender disposes the request after use. In the old code, + /// a second hedge sender could still be running with a reference to a disposed request + /// because it wasn't cancelled via the hedge CTS. This test verifies no + /// ArgumentNullException occurs. + /// + [TestMethod] + public async Task MultiRegionHedging_RequestNotAccessedAfterDisposal() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + int senderCallCount = 0; + bool requestWasAccessibleOnCancellation = false; + bool firstRequestWasCancelled = false; + + Func> sender = async (req, ct) => + { + int callNumber = Interlocked.Increment(ref senderCallCount); + + if (callNumber == 1) + { + // First request: simulate slow response, check req on cancellation + TaskCompletionSource cancelledTcs = new TaskCompletionSource(); + using (ct.Register(() => + { + firstRequestWasCancelled = true; + // Verify request is still accessible at cancellation point + // In the old code, request could be null/disposed here + try + { + _ = req.ResourceType; + requestWasAccessibleOnCancellation = true; + } + catch (NullReferenceException) + { + requestWasAccessibleOnCancellation = false; + } + catch (ObjectDisposedException) + { + requestWasAccessibleOnCancellation = false; + } + + cancelledTcs.TrySetResult(true); + })) + { + await cancelledTcs.Task; + } + + // Return transient response instead of throwing to avoid faulted task propagation + return new ResponseMessage(HttpStatusCode.ServiceUnavailable); + } + + return new ResponseMessage(HttpStatusCode.OK); + }; + + // Act + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, CancellationToken.None); + + // Assert + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + Assert.IsTrue(firstRequestWasCancelled, + "The first request's token should have been cancelled when the second hedge returned a final result."); + Assert.IsTrue(requestWasAccessibleOnCancellation, + "Request should not be null/disposed when the sender is cancelled. " + + "The fix ensures in-flight requests are cancelled via hedge CTS before disposal."); + } + + /// + /// Verifies the fix works for ReadItemStreamAsync code path (from NullRef2 and NullRef3 stack traces). + /// The stream-based path uses ReadItemStreamAsync -> ProcessItemStreamAsync -> RequestInvokerHandler -> + /// CrossRegionHedgingAvailabilityStrategy. This test ensures the sender cancellation token + /// is the hedge CTS token, not the app token, for stream operations too. + /// + [TestMethod] + public async Task HedgeCancellation_StreamRequest_NoNullRef() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + // Create request with stream content (like ReadItemStreamAsync path) + using RequestMessage request = new RequestMessage( + HttpMethod.Get, + new Uri("/dbs/testdb/colls/testcontainer/docs/testId", UriKind.Relative)) + { + ResourceType = ResourceType.Document, + OperationType = OperationType.Read, + Content = new MemoryStream(new byte[] { 1, 2, 3 }) + }; + + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + int senderCallCount = 0; + bool firstRequestCancellationTokenWasCancelled = false; + + Func> sender = async (req, ct) => + { + int callNumber = Interlocked.Increment(ref senderCallCount); + + if (callNumber == 1) + { + // Wait for cancellation via a TCS that completes on cancel + TaskCompletionSource cancelledTcs = new TaskCompletionSource(); + using (ct.Register(() => + { + firstRequestCancellationTokenWasCancelled = true; + cancelledTcs.TrySetResult(true); + })) + { + await cancelledTcs.Task; + } + + // Return transient response to avoid exception propagation + return new ResponseMessage(HttpStatusCode.ServiceUnavailable); + } + + return new ResponseMessage(HttpStatusCode.OK); + }; + + // Act + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, CancellationToken.None); + + // Assert + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + Assert.IsTrue(firstRequestCancellationTokenWasCancelled, + "Slow stream request's CT should be cancelled via hedge CTS when another hedge returns a final result."); + } + + /// + /// Verifies that when the primary request completes with a non-transient error before + /// the hedge timer fires, no additional hedged requests are sent. + /// + [TestMethod] + public async Task PrimaryRequestFinalResult_NoAdditionalHedgesSent() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(5000), // Very long threshold - hedge timer won't fire + thresholdStep: TimeSpan.FromMilliseconds(5000)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + int senderCallCount = 0; + + Func> sender = (req, ct) => + { + Interlocked.Increment(ref senderCallCount); + return Task.FromResult(new ResponseMessage(HttpStatusCode.OK)); + }; + + // Act + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, CancellationToken.None); + + // Assert + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + Assert.AreEqual(1, senderCallCount, + "Only the primary request should be sent when it returns before the hedge timer fires."); + } + + /// + /// Tests that when all hedge requests return transient errors, the strategy + /// waits for all of them and returns the last response without throwing NullRef. + /// + [TestMethod] + public async Task AllHedgesTransientError_ReturnsLastResponse() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(2); + + int senderCallCount = 0; + + Func> sender = (req, ct) => + { + Interlocked.Increment(ref senderCallCount); + // 503 Service Unavailable is a transient error (not in IsFinalResult) + return Task.FromResult(new ResponseMessage(HttpStatusCode.ServiceUnavailable)); + }; + + // Act + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, request, CancellationToken.None); + + // Assert - should still return a response (the last one), not throw NullRef + Assert.AreEqual(HttpStatusCode.ServiceUnavailable, response.StatusCode); + Assert.IsTrue(senderCallCount >= 2, + $"Expected at least 2 sender calls (primary + hedge), got {senderCallCount}"); + } + + /// + /// Stress test: runs many concurrent executions of the hedging strategy to verify + /// no NullReferenceException occurs under concurrency pressure. + /// This reproduces the production scenario from the crash reports where multiple + /// concurrent ReadItemAsync/ReadItemStreamAsync calls trigger the race condition. + /// + [TestMethod] + public async Task ConcurrentHedgingRequests_NoNullRef() + { + // Arrange + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(5), + thresholdStep: TimeSpan.FromMilliseconds(5)); + + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); + + int nullRefCount = 0; + int completedCount = 0; + const int concurrentRequests = 50; + + Func> sender = async (req, ct) => + { + // Random delay to create race conditions. Use ContinueWith to avoid + // throwing OperationCanceledException when hedge CTS is cancelled. + await Task.Delay(Random.Shared.Next(1, 20), ct).ContinueWith(_ => { }); + + if (ct.IsCancellationRequested) + { + // Return transient response instead of throwing, to simulate + // a request that was cancelled but handled gracefully + return new ResponseMessage(HttpStatusCode.ServiceUnavailable); + } + + return new ResponseMessage(HttpStatusCode.OK); + }; + + // Act + Task[] tasks = new Task[concurrentRequests]; + for (int i = 0; i < concurrentRequests; i++) + { + tasks[i] = Task.Run(async () => + { + try + { + using RequestMessage req = CreateReadRequest(); + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, mockCosmosClient, req, CancellationToken.None); + + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + Interlocked.Increment(ref completedCount); + } + catch (ArgumentNullException) + { + Interlocked.Increment(ref nullRefCount); + } + catch (NullReferenceException) + { + Interlocked.Increment(ref nullRefCount); + } + }); + } + + await Task.WhenAll(tasks); + + // Assert + Assert.AreEqual(0, nullRefCount, + $"Detected {nullRefCount} NullReferenceException(s) out of {concurrentRequests} concurrent requests. " + + "The fix should prevent null refs by cancelling in-flight requests via hedge CTS."); + Assert.AreEqual(concurrentRequests, completedCount, + $"All {concurrentRequests} requests should complete successfully."); + } + } } \ No newline at end of file From af938f038dfeef9e0be62839cadcb4c8b95db22d Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Tue, 17 Feb 2026 15:08:00 -0800 Subject: [PATCH 3/5] test fix --- .../CrossRegionHedgingAvailabilityStrategy.cs | 30 +++++++++++++++-- .../AvailabilityStrategyUnitTests.cs | 33 +++++++++++++++++++ 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs index c6b005c6c0..4844323698 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs @@ -201,9 +201,17 @@ internal override async Task ExecuteAvailabilityStrategyAsync( requestTasks.Remove(hedgeTimer); timerTokenSource.Cancel(); - if (completedTask.IsFaulted) + if (completedTask.IsFaulted || completedTask.IsCanceled) { - AggregateException innerExceptions = completedTask.Exception.Flatten(); + requestTasks.Remove(hedgeTimer); + timerTokenSource.Cancel(); + + if (applicationProvidedCancellationToken.IsCancellationRequested) + { + await (Task)completedTask; + } + + continue; } hedgeResponse = await (Task)completedTask; @@ -238,6 +246,13 @@ internal override async Task ExecuteAvailabilityStrategyAsync( { AggregateException innerExceptions = completedTask.Exception.Flatten(); lastException = innerExceptions.InnerExceptions.FirstOrDefault(); + continue; + } + + if (completedTask.IsCanceled) + { + lastException = new OperationCanceledException(); + continue; } hedgeResponse = await (Task)completedTask; @@ -262,7 +277,16 @@ internal override async Task ExecuteAvailabilityStrategyAsync( throw lastException; } - Debug.Assert(hedgeResponse != null); + if (hedgeResponse == null) + { + if (applicationProvidedCancellationToken.IsCancellationRequested) + { + throw new CosmosOperationCanceledException(new OperationCanceledException(), trace); + } + + throw new InvalidOperationException("Cross-region hedging completed without producing a response."); + } + return hedgeResponse.ResponseMessage; } } diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs index 4bb59de225..8123290a06 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs @@ -602,6 +602,39 @@ public async Task ConcurrentHedgingRequests_NoNullRef() $"All {concurrentRequests} requests should complete successfully."); } + [TestMethod] + public async Task FaultedHedgeTask_DoesNotAbortWhenOtherRegionSucceeds() + { + CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( + threshold: TimeSpan.FromMilliseconds(10), + thresholdStep: TimeSpan.FromMilliseconds(10)); + + using RequestMessage request = CreateReadRequest(); + using CosmosClient mockCosmosClient = CreateMockClientWithRegions(2); + + int senderCallCount = 0; + + Func> sender = (req, ct) => + { + int callNumber = Interlocked.Increment(ref senderCallCount); + if (callNumber == 1) + { + throw new OperationCanceledException("Simulated faulted hedge task"); + } + + return Task.FromResult(new ResponseMessage(HttpStatusCode.OK)); + }; + + ResponseMessage response = await availabilityStrategy.ExecuteAvailabilityStrategyAsync( + sender, + mockCosmosClient, + request, + CancellationToken.None); + + Assert.AreEqual(HttpStatusCode.OK, response.StatusCode); + Assert.IsTrue(senderCallCount >= 2, "Expected a second hedge request to complete successfully."); + } + } } \ No newline at end of file From f65b00e702d1ac5bf08c5c019f9996b4a138ad31 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 19 Feb 2026 22:19:40 +0000 Subject: [PATCH 4/5] Initial plan From 32f5196887ae9028be0325e5f81aee2fdb880daa Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 19 Feb 2026 22:24:01 +0000 Subject: [PATCH 5/5] Fix dead code: replace !IsCompleted with IsFaulted || IsCanceled in hedge timer loop Co-authored-by: kundadebdatta <87335885+kundadebdatta@users.noreply.github.com> --- .../CrossRegionHedgingAvailabilityStrategy.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs index 4844323698..bddb342324 100644 --- a/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs +++ b/Microsoft.Azure.Cosmos/src/Routing/AvailabilityStrategy/CrossRegionHedgingAvailabilityStrategy.cs @@ -191,7 +191,7 @@ internal override async Task ExecuteAvailabilityStrategyAsync( // or the hedgeTimer task failed (or more commonly since this is a linked CTS was cancelled) // in both of these cases we do not want to spawn new hedge requests // but just consolidate the outcome of previous requests - (!completedTask.IsCompleted || applicationProvidedCancellationToken.IsCancellationRequested)); + (completedTask.IsFaulted || completedTask.IsCanceled || applicationProvidedCancellationToken.IsCancellationRequested)); if (completedTask == hedgeTimer) {