From 09a5cadae3b5e81875324c872909e6144965efd3 Mon Sep 17 00:00:00 2001 From: Tom Longhurst <30480171+thomhurst@users.noreply.github.com> Date: Mon, 11 May 2026 12:01:50 +0100 Subject: [PATCH] fix(otel): require two consecutive idle windows in DrainAsync to catch in-transit POSTs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run 25639584797 surfaced Receiver_DrainAsync_WaitsForLatePostBeforeReturning failing on macOS arm64. Local repro on Windows: 3% failure rate over 100 iterations of the existing test, ~270ms drain elapsed. The race: a request that's been sent over TCP but not yet pulled by HttpListener.GetContextAsync is invisible to both _inflightTasks and TotalRequests. There's no hook between the kernel TCP queue and the accept loop, so a single 250ms idle window can return while a POST is mid-flight. The original test caught this with a wall-clock floor (>=350ms), but that floor itself was fragile and not actually checking the contract. Two changes: - DrainAsync now requires two consecutive idle stable windows (~500ms minimum) before returning. A POST in transit at the start of drain has at least one full window to surface in TotalRequests / inflight. - The test now asserts the actual contract — the late POST's TaskCompletionSource was set before drain returned — instead of a wall-clock floor. The wall-clock floor was a proxy that broke down when scheduling jitter reordered the post relative to drain start. Local: 100/100 passes after the fix. --- .../OtlpReceiverIngestionTests.cs | 21 +++++++++---------- TUnit.OpenTelemetry/Receiver/OtlpReceiver.cs | 17 ++++++++++++++- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/TUnit.OpenTelemetry.Tests/OtlpReceiverIngestionTests.cs b/TUnit.OpenTelemetry.Tests/OtlpReceiverIngestionTests.cs index 7973e7634d..0f5e6d672f 100644 --- a/TUnit.OpenTelemetry.Tests/OtlpReceiverIngestionTests.cs +++ b/TUnit.OpenTelemetry.Tests/OtlpReceiverIngestionTests.cs @@ -202,28 +202,27 @@ public async Task Receiver_DrainAsync_WaitsForLatePostBeforeReturning() // Simulate a SUT exporter that flushes a couple hundred ms after the test logic // would finish — without DrainAsync, AspireFixture would tear down the AppHost // and the late POST would fail / be dropped. + var latePostCompleted = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); var latePost = Task.Run(async () => { await Task.Delay(TimeSpan.FromMilliseconds(200)); using var client = new HttpClient(); using var content = new ByteArrayContent(Array.Empty()); await client.PostAsync($"http://127.0.0.1:{receiver.Port}/v1/traces", content); + latePostCompleted.SetResult(true); }); - var drainStart = DateTime.UtcNow; await receiver.DrainAsync(TimeSpan.FromSeconds(3)); - var drainElapsed = DateTime.UtcNow - drainStart; - - await latePost; + // Asserts the real contract directly: drain returned only after the late POST + // had been issued and acknowledged. Wall-clock floors (the previous approach) + // failed on macOS arm64 because Task.Delay scheduling jitter could push the + // late POST inside the synchronous prefix of DrainAsync, making the elapsed + // measurement misrepresent the actual invariant the drain must hold. + await Assert.That(latePostCompleted.Task.IsCompleted).IsTrue(); await Assert.That(receiver.Diagnostics.TracesRequests).IsEqualTo(1); - // The drain must have waited past the first 250ms stable window — otherwise the - // 200ms-delayed POST would have landed after the drain returned. Lower bound is - // 350ms to leave headroom for CI scheduling jitter; the real invariant is "drain - // didn't return at ~250ms". - await Assert.That(drainElapsed).IsGreaterThanOrEqualTo(TimeSpan.FromMilliseconds(350)); - // And it must respect the cap — no point waiting indefinitely once quiet. - await Assert.That(drainElapsed).IsLessThan(TimeSpan.FromSeconds(3)); + + await latePost; } [Test] diff --git a/TUnit.OpenTelemetry/Receiver/OtlpReceiver.cs b/TUnit.OpenTelemetry/Receiver/OtlpReceiver.cs index 5403c904a8..730699b799 100644 --- a/TUnit.OpenTelemetry/Receiver/OtlpReceiver.cs +++ b/TUnit.OpenTelemetry/Receiver/OtlpReceiver.cs @@ -155,6 +155,14 @@ public async Task DrainAsync(TimeSpan? window = null, CancellationToken cancella var totalWindow = window ?? DefaultDrainWindow; var clock = Stopwatch.StartNew(); + // A request that's been sent over TCP but not yet pulled by GetContextAsync is + // invisible to both _inflightTasks and _diagnostics.TotalRequests — there's no + // hook between kernel TCP queue and HttpListener's accept loop. A single 250ms + // idle window can therefore return while a request is still on the wire. Require + // two consecutive idle windows (~500ms) so an in-transit POST has a chance to + // surface before drain declares quiet. + var consecutiveIdleWindows = 0; + while (!cancellationToken.IsCancellationRequested) { var beforeCount = Volatile.Read(ref _diagnostics.TotalRequests); @@ -188,7 +196,14 @@ await Task.Delay(stableFor < remaining ? stableFor : remaining, cancellationToke var afterCount = Volatile.Read(ref _diagnostics.TotalRequests); if (afterCount == beforeCount && _inflightTasks.IsEmpty) { - return; + if (++consecutiveIdleWindows >= 2) + { + return; + } + } + else + { + consecutiveIdleWindows = 0; } } }