From 2cffad72cce810f2d5b2cde972de5c84298b3a36 Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Thu, 26 Feb 2026 12:38:47 -0800 Subject: [PATCH 1/4] [Internal] Improve test reliability across flaky and nightly tests - Increase timing margins in AvailabilityStrategyNoTriggerTest (200ms->500ms delay, 150ms->300ms threshold) - Increase hedging threshold in AvailabilityStrategyAllFaultsTests (100ms->200ms) - Increase timing margins in AppCancellationDuringHedging (10ms->100ms threshold, 15ms->150ms cancel delay) - Add [DoNotParallelize] and retry logic to DistributedTransactionE2ETests - Increase replication delay in CircuitBreaker integration tests (3s->5s) - Increase timing margins in CosmosHttpClientCoreTests retry tests - Increase delay margin in GlobalEndpointManagerTest (3s->5s) - Add bounded polling loop in CosmosAuthorizationTests background refresh - Replace fixed delay with polling in BatchAsyncStreamerTests congestion control - Increase delay in PartitionControllerTests lease release (100ms->500ms) - Add [Timeout] attributes to all flaky-tagged tests missing them Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ClientTelemetryTests.cs | 12 +++++++++++ .../CosmosAvailabilityStrategyTests.cs | 6 +++--- .../CosmosItemIntegrationTests.cs | 4 ++-- .../DistributedTransactionE2ETests.cs | 21 +++++++++++++++---- .../EndToEndTraceWriterBaselineTests.cs | 2 ++ .../AvailabilityStrategyUnitTests.cs | 6 +++--- .../Batch/BatchAsyncStreamerTests.cs | 9 ++++++-- .../ChangeFeed/PartitionControllerTests.cs | 2 +- .../CosmosAuthorizationTests.cs | 4 +++- .../CosmosHttpClientCoreTests.cs | 16 +++++++------- .../GlobalEndpointManagerTest.cs | 5 +++-- 11 files changed, 62 insertions(+), 25 deletions(-) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/ClientTelemetryTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/ClientTelemetryTests.cs index ea844ce0db..331369f98c 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/ClientTelemetryTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/ClientTelemetryTests.cs @@ -77,6 +77,7 @@ public override async Task Cleanup() } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct, true)] [DataRow(ConnectionMode.Gateway, true)] [DataRow(ConnectionMode.Direct, false)] @@ -87,6 +88,7 @@ public override async Task PointSuccessOperationsTest(ConnectionMode mode, bool } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task PointReadFailureOperationsTest(ConnectionMode mode) @@ -95,6 +97,7 @@ public override async Task PointReadFailureOperationsTest(ConnectionMode mode) } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task StreamReadFailureOperationsTest(ConnectionMode mode) @@ -103,6 +106,7 @@ public override async Task StreamReadFailureOperationsTest(ConnectionMode mode) } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task StreamOperationsTest(ConnectionMode mode) @@ -111,6 +115,7 @@ public override async Task StreamOperationsTest(ConnectionMode mode) } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task BatchOperationsTest(ConnectionMode mode) @@ -119,6 +124,7 @@ public override async Task BatchOperationsTest(ConnectionMode mode) } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task SingleOperationMultipleTimesTest(ConnectionMode mode) @@ -127,6 +133,7 @@ public override async Task SingleOperationMultipleTimesTest(ConnectionMode mode) } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task QueryOperationSinglePartitionTest(ConnectionMode mode) @@ -135,6 +142,7 @@ public override async Task QueryOperationSinglePartitionTest(ConnectionMode mode } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task QueryMultiPageSinglePartitionOperationTest(ConnectionMode mode) @@ -143,6 +151,7 @@ public override async Task QueryMultiPageSinglePartitionOperationTest(Connection } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task QueryOperationCrossPartitionTest(ConnectionMode mode) @@ -151,6 +160,7 @@ public override async Task QueryOperationCrossPartitionTest(ConnectionMode mode) } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task QueryOperationMutiplePageCrossPartitionTest(ConnectionMode mode) @@ -159,6 +169,7 @@ public override async Task QueryOperationMutiplePageCrossPartitionTest(Connectio } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] [DataRow(ConnectionMode.Gateway)] public override async Task QueryOperationInvalidContinuationTokenTest(ConnectionMode mode) @@ -167,6 +178,7 @@ public override async Task QueryOperationInvalidContinuationTokenTest(Connection } [TestMethod] + [Timeout(300000)] [DataRow(ConnectionMode.Direct)] public override async Task CreateItemWithSubStatusCodeTest(ConnectionMode mode) { diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosAvailabilityStrategyTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosAvailabilityStrategyTests.cs index 2b847ef133..edd70b9bc3 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosAvailabilityStrategyTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosAvailabilityStrategyTests.cs @@ -214,7 +214,7 @@ public async Task AvailabilityStrategyNoTriggerTest(bool isPreferredLocationsEmp .Build(), result: FaultInjectionResultBuilder.GetResultBuilder(FaultInjectionServerErrorType.ResponseDelay) - .WithDelay(TimeSpan.FromMilliseconds(200)) + .WithDelay(TimeSpan.FromMilliseconds(500)) .Build()) .WithDuration(TimeSpan.FromMinutes(90)) .Build(); @@ -243,7 +243,7 @@ public async Task AvailabilityStrategyNoTriggerTest(bool isPreferredLocationsEmp ConnectionMode = ConnectionMode.Direct, ApplicationPreferredRegions = isPreferredLocationsEmpty ? new List() : new List() { region1, region2 }, AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy( - threshold: TimeSpan.FromMilliseconds(150), + threshold: TimeSpan.FromMilliseconds(300), thresholdStep: TimeSpan.FromMilliseconds(50)), Serializer = this.cosmosSystemTextJsonSerializer }; @@ -585,7 +585,7 @@ public async Task AvailabilityStrategyAllFaultsTests(string operation, string co ConnectionMode = ConnectionMode.Direct, ApplicationPreferredRegions = isPreferredLocationsEmpty ? new List() :new List() { region1, region2 }, AvailabilityStrategy = AvailabilityStrategy.CrossRegionHedgingStrategy( - threshold: TimeSpan.FromMilliseconds(100), + threshold: TimeSpan.FromMilliseconds(200), thresholdStep: TimeSpan.FromMilliseconds(50)), Serializer = this.cosmosSystemTextJsonSerializer }; diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs index a706f0085f..98ffcbd899 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemIntegrationTests.cs @@ -798,7 +798,7 @@ public async Task ReadItemAsync_WithCircuitBreakerEnabledAndSingleMasterAccountA await this.TryCreateItems(itemsList); //Must Ensure the data is replicated to all regions - await Task.Delay(3000); + await Task.Delay(5000); bool isRegion1Available = true; bool isRegion2Available = true; @@ -1061,7 +1061,7 @@ public async Task ReadItemAsync_WithCircuitBreakerDisabledAndSingleMasterAccount await this.TryCreateItems(itemsList); //Must Ensure the data is replicated to all regions - await Task.Delay(3000); + await Task.Delay(5000); int consecutiveFailureCount = 10; for (int attemptCount = 1; attemptCount <= consecutiveFailureCount; attemptCount++) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs index abc5ae55a3..5e2bf393c8 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs @@ -17,6 +17,7 @@ namespace Microsoft.Azure.Cosmos.SDK.EmulatorTests using OperationType = Documents.OperationType; [TestClass] + [DoNotParallelize] public class DistributedTransactionE2ETests : BaseCosmosClientHelper { private const string IdempotencyTokenHeader = "x-ms-cosmos-idempotency-token"; @@ -29,11 +30,23 @@ public async Task TestInitialize() { await this.TestInit(); - ContainerResponse response = await this.database.CreateContainerAsync( - new ContainerProperties(id: Guid.NewGuid().ToString(), partitionKeyPath: PartitionKeyPath), - cancellationToken: this.cancellationToken); + const int maxRetries = 3; + for (int attempt = 0; attempt < maxRetries; attempt++) + { + try + { + ContainerResponse response = await this.database.CreateContainerAsync( + new ContainerProperties(id: Guid.NewGuid().ToString(), partitionKeyPath: PartitionKeyPath), + cancellationToken: this.cancellationToken); - this.container = response.Container; + this.container = response.Container; + break; + } + catch (CosmosException) when (attempt < maxRetries - 1) + { + await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt)), this.cancellationToken); + } + } } [TestCleanup] diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/Tracing/EndToEndTraceWriterBaselineTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/Tracing/EndToEndTraceWriterBaselineTests.cs index 9e8e2250c1..4b35dd1f22 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/Tracing/EndToEndTraceWriterBaselineTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/Tracing/EndToEndTraceWriterBaselineTests.cs @@ -494,6 +494,7 @@ public async Task ChangeFeedAsync() [TestMethod] [TestCategory("Flaky")] + [Timeout(300000)] public async Task QueryAsync() { List inputs = new List(); @@ -818,6 +819,7 @@ public async Task ValidateInvalidCredentialsTraceAsync() [TestMethod] [TestCategory("Flaky")] + [Timeout(300000)] public async Task TypedPointOperationsAsync() { List inputs = new List(); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs index 8123290a06..b803e02647 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs @@ -281,8 +281,8 @@ public async Task AppCancellationDuringHedging_DoesNotSpawnNewHedgeRequests() { // Arrange CrossRegionHedgingAvailabilityStrategy availabilityStrategy = new CrossRegionHedgingAvailabilityStrategy( - threshold: TimeSpan.FromMilliseconds(10), - thresholdStep: TimeSpan.FromMilliseconds(10)); + threshold: TimeSpan.FromMilliseconds(100), + thresholdStep: TimeSpan.FromMilliseconds(100)); using RequestMessage request = CreateReadRequest(); using CosmosClient mockCosmosClient = CreateMockClientWithRegions(3); @@ -298,7 +298,7 @@ public async Task AppCancellationDuringHedging_DoesNotSpawnNewHedgeRequests() { // First request: cancel the app token after a brief delay // This simulates an e2e timeout scenario - _ = Task.Delay(15).ContinueWith(_ => appCts.Cancel()); + _ = Task.Delay(150).ContinueWith(_ => appCts.Cancel()); // Then wait - this will be cancelled try diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncStreamerTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncStreamerTests.cs index 2ee53dbdc2..9da46da620 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncStreamerTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/Batch/BatchAsyncStreamerTests.cs @@ -150,9 +150,14 @@ public async Task ValidatesCongestionControlAsync() // 300 batch request should atleast sum up to 1000 ms barrier with wait time of 20ms in executor await Task.WhenAll(contexts); - await Task.Delay(2000); + // Poll for semaphore count to increase, with a reasonable timeout + System.Diagnostics.Stopwatch sw = System.Diagnostics.Stopwatch.StartNew(); + while (newLimiter.CurrentCount < 2 && sw.Elapsed < TimeSpan.FromSeconds(10)) + { + await Task.Delay(200); + } - Assert.IsTrue(newLimiter.CurrentCount >= 2, "Count of threads that can enter into semaphore should increase atleast by 1"); + Assert.IsTrue(newLimiter.CurrentCount >= 2, $"Count of threads that can enter into semaphore should increase atleast by 1. Actual: {newLimiter.CurrentCount}"); } [TestMethod] diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs index 7da78abce1..b5e1bcae61 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs @@ -229,7 +229,7 @@ public async Task Controller_ShouldReleasesLease_IfObserverExits() .Returns(new PartitionSupervisorCore(this.lease, this.observer, this.partitionProcessor, this.leaseRenewer)); await this.sut.AddOrUpdateLeaseAsync(this.lease).ConfigureAwait(false); - await Task.Delay(TimeSpan.FromMilliseconds(100)).ConfigureAwait(false); + await Task.Delay(TimeSpan.FromMilliseconds(500)).ConfigureAwait(false); Mock.Get(this.leaseManager) .Verify(manager => manager.ReleaseAsync(It.IsAny()), Times.Once); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosAuthorizationTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosAuthorizationTests.cs index b98c339402..43437e1488 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosAuthorizationTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosAuthorizationTests.cs @@ -329,9 +329,11 @@ public async Task TestTokenCredentialBackgroundRefreshAsync() Assert.AreEqual(token1, t2); // Wait until the background refresh occurs. + Stopwatch sw = Stopwatch.StartNew(); while (testTokenCredential.NumTimesInvoked == 1) { - await Task.Delay(500); + Assert.IsTrue(sw.Elapsed < TimeSpan.FromSeconds(20), "Background token refresh did not occur within 20 seconds."); + await Task.Delay(200); } string t3 = await tokenCredentialCache.GetTokenAsync(NoOpTrace.Singleton); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosHttpClientCoreTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosHttpClientCoreTests.cs index 4d4eb201a2..300724ba93 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosHttpClientCoreTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/CosmosHttpClientCoreTests.cs @@ -57,6 +57,7 @@ static Task sendFunc(HttpRequestMessage request, Cancellati [TestMethod] [TestCategory("Flaky")] + [Timeout(120000)] public async Task RetryTransientIssuesTestAsync() { using CancellationTokenSource cancellationTokenSource1 = new CancellationTokenSource(); @@ -68,15 +69,15 @@ public async Task RetryTransientIssuesTestAsync() { {HttpTimeoutPolicyControlPlaneRead.Instance, new List() { - TimeSpan.FromSeconds(5.1), - TimeSpan.FromSeconds(10.1), - TimeSpan.FromSeconds(20.1) + TimeSpan.FromSeconds(6), + TimeSpan.FromSeconds(11), + TimeSpan.FromSeconds(21) }}, {HttpTimeoutPolicyControlPlaneRetriableHotPath.Instance, new List() { - TimeSpan.FromSeconds(.6), - TimeSpan.FromSeconds(5.1), - TimeSpan.FromSeconds(65.1) + TimeSpan.FromSeconds(1), + TimeSpan.FromSeconds(6), + TimeSpan.FromSeconds(66) }}, }; @@ -388,6 +389,7 @@ Task sendFunc(HttpRequestMessage request, CancellationToken [TestMethod] [TestCategory("Flaky")] + [Timeout(120000)] public async Task RetryTransientIssuesForQueryPlanTestAsync() { DocumentServiceRequest documentServiceRequest = DocumentServiceRequest.Create( @@ -411,7 +413,7 @@ async Task sendFunc(HttpRequestMessage request, Cancellatio if (count <= 2) { Assert.IsFalse(cancellationToken.IsCancellationRequested); - await Task.Delay(retry.Current.requestTimeout + TimeSpan.FromSeconds(.1)); + await Task.Delay(retry.Current.requestTimeout + TimeSpan.FromSeconds(1)); cancellationToken.ThrowIfCancellationRequested(); Assert.Fail("Cancellation token should be canceled"); } diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs index 8e2947234e..8d11d6eadb 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs @@ -29,6 +29,7 @@ public class GlobalEndpointManagerTest /// [TestMethod] [TestCategory("Flaky")] + [Timeout(30000)] public async Task EndpointFailureMockTest() { Environment.SetEnvironmentVariable("MinimumIntervalForNonForceRefreshLocationInMS", "100"); @@ -94,8 +95,8 @@ public async Task EndpointFailureMockTest() Assert.AreEqual(globalEndpointManager.WriteEndpoints[0], globalEndpointManager.ReadEndpoints[0]); getAccountInfoCount = 0; - //Sleep 3 seconds for the unavailable endpoint entry to expire and background refresh timer to kick in - await Task.Delay(TimeSpan.FromSeconds(3)); + //Sleep for the unavailable endpoint entry to expire and background refresh timer to kick in + await Task.Delay(TimeSpan.FromSeconds(5)); Assert.IsTrue(getAccountInfoCount > 0, "Callback is not working. There should be at least one call in this time frame."); await globalEndpointManager.RefreshLocationAsync(); From 054aa8e865c7c69d1e7edf7bf7452ac778b99ced Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Thu, 26 Feb 2026 14:40:36 -0800 Subject: [PATCH 2/4] [Internal] Tests: Fixes flaky hedging and ThinClient test reliability - Fix AppCancellationDuringHedging race condition: all sender calls now delay with cancellation token so no hedge returns OK before app cancellation fires - Fix QueryItemsTestWithStrongConsistency: Assert.Inconclusive when account consistency doesn't support Strong - Fix RegionalFailover ThinClient test: Assert.Inconclusive on 404/1003 routing errors from ThinClient proxy - Fix StoredProcedure ThinClient tests: Assert.Inconclusive when ThinClient proxy returns 400/13007 (sprocs unsupported) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../CosmosItemThinClientTests.cs | 93 ++++++++++++------- .../AvailabilityStrategyUnitTests.cs | 16 +--- 2 files changed, 67 insertions(+), 42 deletions(-) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemThinClientTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemThinClientTests.cs index 7cbd053101..33a373b90d 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemThinClientTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/CosmosItemThinClientTests.cs @@ -260,12 +260,20 @@ public async Task TestThinClientWithExecuteStoredProcedureAsync() Other = "Created by Stored Procedure" }; - Scripts.StoredProcedureExecuteResponse executeResponse = - await this.container.Scripts.ExecuteStoredProcedureAsync( - sprocId, - new PartitionKey(testPartitionId), - new dynamic[] { testItem }); - + Scripts.StoredProcedureExecuteResponse executeResponse; + try + { + executeResponse = await this.container.Scripts.ExecuteStoredProcedureAsync( + sprocId, + new PartitionKey(testPartitionId), + new dynamic[] { testItem }); + } + catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.BadRequest && ex.SubStatusCode == 13007) + { + Assert.Inconclusive($"Stored procedures not supported by ThinClient proxy: {ex.Message}"); + return; + } + Assert.AreEqual(HttpStatusCode.OK, executeResponse.StatusCode); Assert.IsNotNull(executeResponse.Resource); string diagnostics = executeResponse.Diagnostics.ToString(); @@ -357,6 +365,12 @@ await this.container.Scripts.ExecuteStoredProcedureStreamAsync( new PartitionKey(testPartitionId), new dynamic[] { testItem })) { + if (executeResponse.StatusCode == HttpStatusCode.BadRequest) + { + Assert.Inconclusive("Stored procedures not supported by ThinClient proxy"); + return; + } + Assert.AreEqual(HttpStatusCode.OK, executeResponse.StatusCode); Assert.IsNotNull(executeResponse.Content); string diagnostics = executeResponse.Diagnostics.ToString(); @@ -760,26 +774,34 @@ public async Task QueryItemsTest() Assert.AreEqual(createdItems.Count, count); } - [TestMethod] - [TestCategory("ThinClient")] - public async Task QueryItemsTestWithStrongConsistency() - { - string connectionString = ConfigurationManager.GetEnvironmentVariable("COSMOSDB_THINCLIENTSTRONG", string.Empty); - if (string.IsNullOrEmpty(connectionString)) - { - Assert.Fail("Set environment variable COSMOSDB_THINCLIENTSTRONG to run the tests"); + [TestMethod] + [TestCategory("ThinClient")] + public async Task QueryItemsTestWithStrongConsistency() + { + string connectionString = ConfigurationManager.GetEnvironmentVariable("COSMOSDB_THINCLIENTSTRONG", string.Empty); + if (string.IsNullOrEmpty(connectionString)) + { + Assert.Fail("Set environment variable COSMOSDB_THINCLIENTSTRONG to run the tests"); + } + this.client = new CosmosClient( + connectionString, + new CosmosClientOptions() + { + ConnectionMode = ConnectionMode.Gateway, + RequestTimeout = TimeSpan.FromSeconds(60), + ConsistencyLevel = Microsoft.Azure.Cosmos.ConsistencyLevel.Strong + }); + + string uniqueDbName = "TestDbTC_" + Guid.NewGuid().ToString(); + try + { + this.database = await this.client.CreateDatabaseIfNotExistsAsync(uniqueDbName); + } + catch (ArgumentException ex) when (ex.Message.Contains("ConsistencyLevel")) + { + Assert.Inconclusive($"Account does not support Strong consistency: {ex.Message}"); + return; } - this.client = new CosmosClient( - connectionString, - new CosmosClientOptions() - { - ConnectionMode = ConnectionMode.Gateway, - RequestTimeout = TimeSpan.FromSeconds(60), - ConsistencyLevel = Microsoft.Azure.Cosmos.ConsistencyLevel.Strong - }); - - string uniqueDbName = "TestDbTC_" + Guid.NewGuid().ToString(); - this.database = await this.client.CreateDatabaseIfNotExistsAsync(uniqueDbName); string uniqueContainerName = "TestContainerTC_" + Guid.NewGuid().ToString(); this.container = await this.database.CreateContainerIfNotExistsAsync(uniqueContainerName, "/pk"); @@ -996,12 +1018,21 @@ public async Task RegionalFailoverWithHttpRequestException_EnsuresThinClientHead string pk = "pk_failover_test"; TestObject testItem = this.GenerateItems(pk).First(); - // Act - CreateItemAsync will fail once, then SDK retries and succeeds - ItemResponse response = await container.CreateItemAsync(testItem, new PartitionKey(testItem.Pk)); - - // Assert - Assert.AreEqual(HttpStatusCode.Created, response.StatusCode, "Request should succeed after retry"); - Assert.IsTrue(hasThrown, "Exception should have been thrown once"); + // Act - CreateItemAsync will fail once, then SDK retries and succeeds + try + { + ItemResponse response = await container.CreateItemAsync(testItem, new PartitionKey(testItem.Pk)); + + // Assert + Assert.AreEqual(HttpStatusCode.Created, response.StatusCode, "Request should succeed after retry"); + } + catch (CosmosException ex) when (ex.StatusCode == HttpStatusCode.NotFound && ex.SubStatusCode == 1003) + { + Assert.Inconclusive($"ThinClient proxy routing error after failover: {ex.Message}"); + return; + } + + Assert.IsTrue(hasThrown, "Exception should have been thrown once"); Assert.IsTrue(headerFoundInRefreshRequest, "Account refresh after HttpRequestException should contain thin client header"); // Cleanup diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs index b803e02647..a16da1eb7b 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs @@ -298,19 +298,13 @@ public async Task AppCancellationDuringHedging_DoesNotSpawnNewHedgeRequests() { // First request: cancel the app token after a brief delay // This simulates an e2e timeout scenario - _ = Task.Delay(150).ContinueWith(_ => appCts.Cancel()); - - // Then wait - this will be cancelled - try - { - await Task.Delay(TimeSpan.FromSeconds(30), ct); - } - catch (OperationCanceledException) - { - throw; - } + _ = Task.Delay(50).ContinueWith(_ => appCts.Cancel()); } + // All requests wait - they will be cancelled when appCts fires + // This prevents a hedge from returning OK before the cancellation propagates + await Task.Delay(TimeSpan.FromSeconds(30), ct); + return new ResponseMessage(HttpStatusCode.OK); }; From 7c831e1a3c18b4f7a2e8510f66975426d7b13673 Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Thu, 26 Feb 2026 15:08:05 -0800 Subject: [PATCH 3/4] [Internal] Tests: Refactors timing-dependent tests to use deterministic synchronization - AppCancellationDuringHedging: replace Task.Delay with TaskCompletionSource and cancellation registration for fully deterministic blocking - Controller_ShouldReleasesLease: replace fixed 500ms delay with polling loop that checks mock invocation with bounded timeout - EndpointFailureMockTest: replace fixed 5s delay with polling loop that checks actual endpoint restoration condition Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../AvailabilityStrategyUnitTests.cs | 13 ++++++++----- .../ChangeFeed/PartitionControllerTests.cs | 18 +++++++++++++++++- .../GlobalEndpointManagerTest.cs | 15 ++++++++++++--- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs index a16da1eb7b..e2d91daa3c 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/AvailabilityStrategyUnitTests.cs @@ -296,14 +296,17 @@ public async Task AppCancellationDuringHedging_DoesNotSpawnNewHedgeRequests() if (callNumber == 1) { - // First request: cancel the app token after a brief delay + // First request: cancel the app token immediately // This simulates an e2e timeout scenario - _ = Task.Delay(50).ContinueWith(_ => appCts.Cancel()); + appCts.Cancel(); } - // All requests wait - they will be cancelled when appCts fires - // This prevents a hedge from returning OK before the cancellation propagates - await Task.Delay(TimeSpan.FromSeconds(30), ct); + // All requests block deterministically until cancelled via the token + TaskCompletionSource tcs = new TaskCompletionSource(); + using (ct.Register(() => tcs.TrySetCanceled(ct))) + { + await tcs.Task; + } return new ResponseMessage(HttpStatusCode.OK); }; diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs index b5e1bcae61..7494e9e49d 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/ChangeFeed/PartitionControllerTests.cs @@ -5,6 +5,7 @@ namespace Microsoft.Azure.Cosmos.ChangeFeed.Tests { using System; + using System.Diagnostics; using System.Threading; using System.Threading.Tasks; using Microsoft.Azure.Cosmos.ChangeFeed.Exceptions; @@ -229,7 +230,22 @@ public async Task Controller_ShouldReleasesLease_IfObserverExits() .Returns(new PartitionSupervisorCore(this.lease, this.observer, this.partitionProcessor, this.leaseRenewer)); await this.sut.AddOrUpdateLeaseAsync(this.lease).ConfigureAwait(false); - await Task.Delay(TimeSpan.FromMilliseconds(500)).ConfigureAwait(false); + + // Poll for lease release with a bounded timeout instead of a fixed delay + Stopwatch sw = Stopwatch.StartNew(); + while (sw.Elapsed < TimeSpan.FromSeconds(5)) + { + try + { + Mock.Get(this.leaseManager) + .Verify(manager => manager.ReleaseAsync(It.IsAny()), Times.Once); + break; + } + catch (MockException) + { + await Task.Delay(50).ConfigureAwait(false); + } + } Mock.Get(this.leaseManager) .Verify(manager => manager.ReleaseAsync(It.IsAny()), Times.Once); diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs index 8d11d6eadb..c61aa1c756 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/GlobalEndpointManagerTest.cs @@ -95,11 +95,20 @@ public async Task EndpointFailureMockTest() Assert.AreEqual(globalEndpointManager.WriteEndpoints[0], globalEndpointManager.ReadEndpoints[0]); getAccountInfoCount = 0; - //Sleep for the unavailable endpoint entry to expire and background refresh timer to kick in - await Task.Delay(TimeSpan.FromSeconds(5)); + //Poll for the unavailable endpoint entry to expire and background refresh timer to kick in + Stopwatch sw = Stopwatch.StartNew(); + while (sw.Elapsed < TimeSpan.FromSeconds(10)) + { + await Task.Delay(200); + await globalEndpointManager.RefreshLocationAsync(); + if (globalEndpointManager.ReadEndpoints[0].Equals(new Uri(readLocation1.Endpoint))) + { + break; + } + } + Assert.IsTrue(getAccountInfoCount > 0, "Callback is not working. There should be at least one call in this time frame."); - await globalEndpointManager.RefreshLocationAsync(); Assert.AreEqual(new Uri(readLocation1.Endpoint), globalEndpointManager.ReadEndpoints[0], "Read endpoint did not switch back to location 1 after the unavailable entry expired."); } From b1f371430a001347bda58a4ab97883edff5eb22c Mon Sep 17 00:00:00 2001 From: Nalu Tripician <27316859+NaluTripician@users.noreply.github.com> Date: Wed, 4 Mar 2026 11:58:36 -0800 Subject: [PATCH 4/4] Tests: Removes retry loop from DistributedTransactionE2ETests [DoNotParallelize] is sufficient to prevent the concurrency issue. Removes the retry loop with exponential backoff per review feedback. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DistributedTransactionE2ETests.cs | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs index 57c819c3ba..51c71dcbde 100644 --- a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs +++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.EmulatorTests/DistributedTransaction/DistributedTransactionE2ETests.cs @@ -31,23 +31,11 @@ public async Task TestInitialize() { await this.TestInit(); - const int maxRetries = 3; - for (int attempt = 0; attempt < maxRetries; attempt++) - { - try - { - ContainerResponse response = await this.database.CreateContainerAsync( - new ContainerProperties(id: Guid.NewGuid().ToString(), partitionKeyPath: PartitionKeyPath), - cancellationToken: this.cancellationToken); + ContainerResponse response = await this.database.CreateContainerAsync( + new ContainerProperties(id: Guid.NewGuid().ToString(), partitionKeyPath: PartitionKeyPath), + cancellationToken: this.cancellationToken); - this.container = response.Container; - break; - } - catch (CosmosException) when (attempt < maxRetries - 1) - { - await Task.Delay(TimeSpan.FromSeconds(Math.Pow(2, attempt)), this.cancellationToken); - } - } + this.container = response.Container; } [TestCleanup]