diff --git a/TUnit.Aspire.Core/AspireFixture.cs b/TUnit.Aspire.Core/AspireFixture.cs index 199ef8bbfa..739dfe8e2e 100644 --- a/TUnit.Aspire.Core/AspireFixture.cs +++ b/TUnit.Aspire.Core/AspireFixture.cs @@ -1,10 +1,12 @@ using System.Collections.Concurrent; +using System.Collections.Immutable; using System.Diagnostics; using System.Text; using Aspire.Hosting; using Aspire.Hosting.ApplicationModel; using Aspire.Hosting.Testing; using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Diagnostics.HealthChecks; using Microsoft.Extensions.Hosting; using TUnit.Core; using TUnit.Core.Interfaces; @@ -30,6 +32,14 @@ public class AspireFixture : IAsyncInitializer, IAsyncDisposable private DistributedApplication? _app; private OtlpReceiver? _otlpReceiver; + // Captured state-transition timeline (recorded by the background monitor), folded into the + // exception when startup fails so the failure tells the whole story in one place. Guarded by + // a lock because the monitor records into it while a failing path reads it (the monitor is + // still running when diagnostics are built — it's stopped in InitializeAsync's finally). + private readonly List _timeline = []; + private readonly object _timelineGate = new(); + private Stopwatch? _timelineClock; + /// /// The running Aspire distributed application. /// @@ -81,7 +91,7 @@ public IAsyncDisposable WatchResourceLogs(string resourceName) { foreach (var line in batch) { - testContext.Output.WriteLine($"[{resourceName}] {line}"); + testContext.Output.WriteLine($"[{resourceName}] {line.Content}"); } } } @@ -97,6 +107,7 @@ public IAsyncDisposable WatchResourceLogs(string resourceName) // --- Configuration hooks (virtual) --- private static readonly Stream StdErr = Console.OpenStandardError(); + private static readonly object StdErrGate = new(); /// /// Logs progress messages during initialization. Override to route to a custom logger. @@ -107,8 +118,14 @@ public IAsyncDisposable WatchResourceLogs(string resourceName) protected virtual void LogProgress(string message) { var bytes = Encoding.UTF8.GetBytes($"[Aspire] {message}{Environment.NewLine}"); - StdErr.Write(bytes, 0, bytes.Length); - StdErr.Flush(); + + // The background resource monitor and the main init thread both log concurrently, and + // separate fixture instances share this static stream — lock so lines never splice. + lock (StdErrGate) + { + StdErr.Write(bytes, 0, bytes.Length); + StdErr.Flush(); + } } /// @@ -262,52 +279,55 @@ public virtual async Task InitializeAsync() var resourceList = string.Join(", ", model.Resources.Select(r => r.Name)); LogProgress($"Starting application with resources: [{resourceList}]"); - // Monitor resource state changes in the background during startup. - // This provides real-time visibility into container health check failures, - // SSL errors, etc. that would otherwise be invisible during StartAsync(). + // Monitor resource state changes in the background, covering BOTH startup and the + // resource-wait phase, so the captured timeline includes health-check-wait hangs. + // This also provides real-time visibility into container health check failures, SSL + // errors, etc. that would otherwise be invisible until a timeout fires. + _timelineClock = Stopwatch.StartNew(); using var monitorCts = new CancellationTokenSource(); var monitorTask = MonitorResourceEventsAsync(_app, monitorCts.Token); + var notificationService = _app.Services.GetRequiredService(); - using var startCts = new CancellationTokenSource(ResourceTimeout); try { - await _app.StartAsync(startCts.Token); - } - catch (OperationCanceledException) when (startCts.IsCancellationRequested) - { - await StopMonitorAsync(monitorCts, monitorTask); - - // Collect resource logs so the timeout error shows WHY startup hung - var sb = new StringBuilder(); - sb.Append($"Timed out after {ResourceTimeout.TotalSeconds:0}s waiting for the Aspire application to start."); - - await AppendResourceLogsAsync(sb, _app, model.Resources.Select(r => r.Name)); - - throw new TimeoutException(sb.ToString()); - } - - await StopMonitorAsync(monitorCts, monitorTask); + using (var startCts = new CancellationTokenSource(ResourceTimeout)) + { + try + { + await _app.StartAsync(startCts.Token); + } + catch (OperationCanceledException) when (startCts.IsCancellationRequested) + { + var headline = $"Timed out after {ResourceTimeout.TotalSeconds:0}s waiting for the Aspire application to start."; + throw new TimeoutException( + await BuildDiagnosticsAndAttachAsync(_app, notificationService, headline, + model.Resources.Select(r => r.Name).ToList())); + } + } - LogProgress($"Application started in {sw.Elapsed.TotalSeconds:0.0}s. Waiting for resources (timeout: {ResourceTimeout.TotalSeconds:0}s, behavior: {WaitBehavior})..."); - using var cts = new CancellationTokenSource(ResourceTimeout); + LogProgress($"Application started in {sw.Elapsed.TotalSeconds:0.0}s. Waiting for resources (timeout: {ResourceTimeout.TotalSeconds:0}s, behavior: {WaitBehavior})..."); - try - { - await WaitForResourcesAsync(_app, cts.Token); - LogProgress("All resources ready."); + using (var cts = new CancellationTokenSource(ResourceTimeout)) + { + try + { + await WaitForResourcesAsync(_app, cts.Token); + LogProgress("All resources ready."); + } + catch (OperationCanceledException) when (cts.IsCancellationRequested) + { + // Fallback for custom WaitForResourcesAsync overrides that don't use the default + // fail-fast helper (which throws its own rich TimeoutException first). + var headline = $"Timed out after {ResourceTimeout.TotalSeconds:0}s waiting for Aspire resources to be ready (wait behavior: {WaitBehavior})."; + throw new TimeoutException( + await BuildDiagnosticsAndAttachAsync(_app, notificationService, headline, + model.Resources.Select(r => r.Name).ToList())); + } + } } - catch (OperationCanceledException) when (cts.IsCancellationRequested) + finally { - // Fallback for custom WaitForResourcesAsync overrides that don't use the - // default fail-fast helper. The default implementation throws its own - // TimeoutException with resource logs before this catch is reached. - var resourceNames = string.Join(", ", model.Resources.Select(r => $"'{r.Name}'")); - - throw new TimeoutException( - $"Timed out after {ResourceTimeout.TotalSeconds:0}s waiting for Aspire resources to be ready. " + - $"Resources: [{resourceNames}]. " + - $"Wait behavior: {WaitBehavior}. " + - $"Consider increasing ResourceTimeout, checking resource health, or using WatchResourceLogs() to diagnose startup issues."); + await StopMonitorAsync(monitorCts, monitorTask); } } @@ -484,7 +504,10 @@ private async Task MonitorResourceEventsAsync(DistributedApplication app, Cancel try { var notificationService = app.Services.GetRequiredService(); - var trackedStates = new ConcurrentDictionary(); + + // Single-consumer loop, so plain dictionaries are safe here. + var trackedStates = new Dictionary(); + var trackedHealth = new Dictionary(); await foreach (var evt in notificationService.WatchAsync(cancellationToken)) { @@ -496,13 +519,28 @@ private async Task MonitorResourceEventsAsync(DistributedApplication app, Cancel continue; } - // Only log when state actually changes var previousState = trackedStates.GetValueOrDefault(name); - if (state != previousState) + var stateChanged = state != previousState; + + var healthSignature = HealthSignature(evt.Snapshot.HealthReports); + var healthChanged = healthSignature != trackedHealth.GetValueOrDefault(name); + + // Only log to stderr when the state text actually changes (unchanged behavior). + if (stateChanged) { trackedStates[name] = state; LogProgress($" [{name}] {previousState ?? "unknown"} -> {state}"); } + + // Record a timeline entry on a state OR health transition so the captured story + // includes health-check deltas (HealthStatus is null off-Running, so diff reports). + if ((stateChanged || healthChanged) && _timelineClock is { } clock) + { + trackedHealth[name] = healthSignature; + RecordTransition(new StateTransition( + clock.Elapsed, name, previousState ?? "(none)", state, + healthChanged ? $"health: {healthSignature}" : null)); + } } } catch (OperationCanceledException) @@ -516,6 +554,51 @@ private async Task MonitorResourceEventsAsync(DistributedApplication app, Cancel } } + private const int TimelineHead = 80; + private const int TimelineTail = 120; + + private void RecordTransition(StateTransition transition) + { + lock (_timelineGate) + { + _timeline.Add(transition); + + // Bound the timeline so a flapping resource can't grow it without limit, but keep + // BOTH the earliest transitions (the startup story — most diagnostic) and the most + // recent ones, dropping only the churn in the middle. Trim infrequently (only past + // double the kept size). + if (_timeline.Count > (TimelineHead + TimelineTail) * 2) + { + TrimTimeline(_timeline, TimelineHead, TimelineTail); + } + } + } + + /// Drops the middle of an over-long timeline, preserving the first + /// and last entries. Pure (operates on the passed list) for unit testing. + internal static void TrimTimeline(List timeline, int head, int tail) + { + var excess = timeline.Count - (head + tail); + if (excess > 0) + { + timeline.RemoveRange(head, excess); + } + } + + private static string HealthSignature(ImmutableArray reports) + { + if (reports.IsDefaultOrEmpty) + { + return "none"; + } + + // Per-check signature (not just the worst aggregate) so two checks flipping in opposite + // directions still register as a change, and the timeline shows which check moved. + return string.Join(", ", reports + .OrderBy(r => r.Name, StringComparer.Ordinal) + .Select(r => $"{r.Name}:{r.Status?.ToString() ?? "Pending"}")); + } + private static async Task StopMonitorAsync(CancellationTokenSource monitorCts, Task monitorTask) { await monitorCts.CancelAsync(); @@ -530,24 +613,6 @@ private static async Task StopMonitorAsync(CancellationTokenSource monitorCts, T } } - /// - /// Appends resource logs for the given resource names to a . - /// Only includes resources that have logs available. - /// - private async Task AppendResourceLogsAsync(StringBuilder sb, DistributedApplication app, IEnumerable resourceNames) - { - foreach (var name in resourceNames) - { - var logs = await CollectResourceLogsAsync(app, name); - if (logs != " (no logs available)") - { - sb.AppendLine().AppendLine(); - sb.AppendLine($"--- {name} logs ---"); - sb.Append(logs); - } - } - } - /// /// Waits for all named resources to reach the desired state, while simultaneously /// watching for any resource entering FailedToStart. If a resource fails, throws @@ -620,10 +685,14 @@ private async Task WaitForResourcesWithFailFastAsync( LogProgress($" Resource '{name}' is {targetState} ({readyResources.Count}/{visibleNames.Count})"); })); - // Fail-fast path: complete as soon as ANY resource enters FailedToStart + // Fail-fast path: complete as soon as ANY resource enters a genuine failure state. + // Watch a non-zero Exited too — a container that crash-exits cleanly never reports + // FailedToStart, so without it we would wait the full timeout for an already-dead + // resource. A clean (code 0) exit is excluded so successful one-shot resources + // (migration runners, seeders) aren't mis-reported as failures (see IsFailureState). var failureTasks = visibleNames.Select(async name => { - await notificationService.WaitForResourceAsync(name, KnownResourceStates.FailedToStart, failureCts.Token); + await notificationService.WaitForResourceAsync(name, evt => IsFailureState(evt.Snapshot), failureCts.Token); return name; }).ToList(); var anyFailureTask = Task.WhenAny(failureTasks); @@ -637,13 +706,11 @@ private async Task WaitForResourcesWithFailFastAsync( { failureCts.Cancel(); var failedName = await await anyFailureTask; - var logs = await CollectResourceLogsAsync(app, failedName); + var headline = $"Resource '{failedName}' failed to start."; throw new InvalidOperationException( - $"Resource '{failedName}' failed to start." + - $"{Environment.NewLine}{Environment.NewLine}" + - $"--- {failedName} logs ---{Environment.NewLine}" + - logs); + await BuildDiagnosticsAndAttachAsync( + app, notificationService, headline, [failedName], readyResources.ToArray())); } // All resources are ready - cancel the failure watchers @@ -652,50 +719,34 @@ private async Task WaitForResourcesWithFailFastAsync( } catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { - // Timeout - report which resources are ready vs. still pending, with logs + // Timeout - diagnose each pending resource (state, exit code, health, dependencies) + // and attach the full timeline + untruncated logs as an artifact. failureCts.Cancel(); var readySet = new HashSet(readyResources); var pending = visibleNames.Where(n => !readySet.Contains(n)).ToList(); - var sb = new StringBuilder(); - sb.Append("Resources not ready: ["); - sb.AppendJoin(", ", pending.Select(n => $"'{n}'")); - sb.Append(']'); - - if (readySet.Count > 0) - { - sb.Append(". Resources ready: ["); - sb.AppendJoin(", ", readySet.Select(n => $"'{n}'")); - sb.Append(']'); - } - - foreach (var name in pending) - { - var logs = await CollectResourceLogsAsync(app, name); - sb.AppendLine().AppendLine(); - sb.AppendLine($"--- {name} logs ---"); - sb.Append(logs); - } - - throw new TimeoutException(sb.ToString()); + var headline = $"Timed out after {ResourceTimeout.TotalSeconds:0}s waiting for resource(s) to become {targetState}."; + throw new TimeoutException( + await BuildDiagnosticsAndAttachAsync(app, notificationService, headline, pending, readySet)); } } /// - /// Collects recent log lines from a resource via the . - /// Returns the last lines, or "(no logs available)" if none. - /// Uses a short timeout to avoid hanging if the log service is unresponsive. + /// Collects buffered log lines from a resource via the . + /// Returns the raw line content (error/stderr lines prefixed with E>), or an empty + /// list if none are available. Uses a short timeout to avoid hanging if the log service is + /// unresponsive; replays the buffered + /// backlog first, so logs from an already-exited resource are still captured. /// - private async Task CollectResourceLogsAsync( + private static async Task> CollectResourceLogLinesAsync( DistributedApplication app, - string resourceName, - int maxLines = 20) + string resourceName) { var loggerService = app.Services.GetRequiredService(); var lines = new List(); - using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(2)); + using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(5)); try { @@ -704,27 +755,16 @@ private async Task CollectResourceLogsAsync( { foreach (var line in batch) { - lines.Add($" {line}"); + lines.Add(line.IsErrorMessage ? $"E> {line.Content}" : line.Content); } } } catch (OperationCanceledException) { - // Expected - we use a short timeout to collect buffered logs - } - - if (lines.Count == 0) - { - return " (no logs available)"; - } - - if (lines.Count > maxLines) - { - return $" ... ({lines.Count - maxLines} earlier lines omitted){Environment.NewLine}" + - string.Join(Environment.NewLine, lines.Skip(lines.Count - maxLines)); + // Expected - we use a short timeout to collect buffered logs. } - return string.Join(Environment.NewLine, lines); + return lines; } /// @@ -785,6 +825,535 @@ private List GetWaitableResourceNames(DistributedApplicationModel model) return waitable; } + // --- Diagnostics --- + + /// Why a resource failed or never became ready — used to attach an actionable hint. + internal enum ResourceFailureClass + { + Unknown, + ContainerRuntimeDown, + ImagePullFailure, + PortInUse, + OutOfMemory, + NonZeroExit, + CrashedNoCode, + HealthCheckFailing, + NeverStarted, + } + + /// A single resource's classified failure state plus a one-line detail and hint. + internal readonly record struct ResourceDiagnosis( + string Name, + string State, + int? ExitCode, + ResourceFailureClass Class, + string? Detail, + string? Hint); + + /// One recorded state/health transition in the startup timeline. + internal readonly record struct StateTransition( + TimeSpan Elapsed, + string Resource, + string From, + string To, + string? HealthDelta); + + private static bool StateEquals(string state, string known) + => string.Equals(state, known, StringComparison.OrdinalIgnoreCase); + + private static bool IsTerminalState(string state) + => StateEquals(state, KnownResourceStates.FailedToStart) + || StateEquals(state, KnownResourceStates.Exited) + || StateEquals(state, KnownResourceStates.Finished); + + private static bool IsPendingState(string state) + => StateEquals(state, KnownResourceStates.Starting) + || StateEquals(state, KnownResourceStates.Waiting) + || StateEquals(state, KnownResourceStates.NotStarted) + || StateEquals(state, KnownResourceStates.Building); + + private static bool IsReadyState(string state) + => StateEquals(state, KnownResourceStates.Running) + || StateEquals(state, KnownResourceStates.Finished); + + /// + /// Whether a snapshot represents a genuine startup failure. FailedToStart always counts; + /// Exited counts only with a non-zero exit code, so a one-shot resource (migration runner, + /// seeder) that exits cleanly with code 0 isn't mis-reported as a failure. + /// + private static bool IsFailureState(CustomResourceSnapshot snapshot) + { + var state = snapshot.State?.Text; + if (state is null) + { + return false; + } + + if (StateEquals(state, KnownResourceStates.FailedToStart)) + { + return true; + } + + return StateEquals(state, KnownResourceStates.Exited) && snapshot.ExitCode is not (null or 0); + } + + /// Scans recent log lines for high-confidence failure signatures. Heuristic; a miss + /// just falls through to state-based classification. + internal static ResourceFailureClass? ScanLogSignatures(IReadOnlyList? lines) + { + if (lines is null) + { + return null; + } + + const StringComparison Ci = StringComparison.OrdinalIgnoreCase; + + foreach (var line in lines) + { + if (line.Contains("cannot connect to the docker daemon", Ci) + || line.Contains("is the docker daemon running", Ci) + || line.Contains("error during connect", Ci)) + { + return ResourceFailureClass.ContainerRuntimeDown; + } + + if (line.Contains("address already in use", Ci) + || line.Contains("port is already allocated", Ci) + || line.Contains("bind: address already in use", Ci)) + { + return ResourceFailureClass.PortInUse; + } + + if (line.Contains("pull access denied", Ci) + || line.Contains("manifest unknown", Ci) + || line.Contains("repository does not exist", Ci) + || line.Contains("not found: manifest", Ci) + || line.Contains("no such image", Ci)) + { + return ResourceFailureClass.ImagePullFailure; + } + } + + return null; + } + + /// Pure classification used by ; separated for unit testing. + internal static ResourceFailureClass Classify( + string state, int? exitCode, bool runningButUnhealthy, IReadOnlyList? logLines) + { + if (exitCode == 137) + { + return ResourceFailureClass.OutOfMemory; + } + + if (ScanLogSignatures(logLines) is { } signature) + { + return signature; + } + + if (StateEquals(state, KnownResourceStates.RuntimeUnhealthy)) + { + return ResourceFailureClass.ContainerRuntimeDown; + } + + if (runningButUnhealthy) + { + return ResourceFailureClass.HealthCheckFailing; + } + + if (IsTerminalState(state)) + { + return exitCode is int code && code != 0 + ? ResourceFailureClass.NonZeroExit + : ResourceFailureClass.CrashedNoCode; + } + + if (IsPendingState(state)) + { + return ResourceFailureClass.NeverStarted; + } + + return ResourceFailureClass.Unknown; + } + + /// Maps a failure class to a single actionable hint, or null when nothing specific applies. + internal static string? HintFor(ResourceFailureClass cls) => cls switch + { + ResourceFailureClass.ContainerRuntimeDown => + "The container runtime is not reachable. Is Docker/Podman running? Start it (e.g. Docker Desktop, or `sudo systemctl start docker`) and retry.", + ResourceFailureClass.ImagePullFailure => + "The container image could not be pulled. Check the image name/tag, registry authentication (`docker login`), and network access.", + ResourceFailureClass.PortInUse => + "A required port is already in use. Stop the process holding it, or avoid pinning host ports in tests so Aspire can pick a free one.", + ResourceFailureClass.OutOfMemory => + "Exit code 137 means the container was killed (OOM / SIGKILL). Increase the container memory limit or the resources allocated to Docker.", + ResourceFailureClass.NonZeroExit => + "The process exited with a non-zero code on startup. The logs usually show the cause (bad config, missing connection string, failed migration).", + ResourceFailureClass.HealthCheckFailing => + "The container is Running but its health check never passed. It started but isn't ready yet — check the health endpoint, slow startup, or raise ResourceTimeout.", + ResourceFailureClass.NeverStarted => + "The resource never reached Running. It is still booting or blocked on a dependency — check the logs and the dependency chain.", + ResourceFailureClass.CrashedNoCode => + "The resource entered a failed state without an exit code. Check the logs and the container runtime.", + _ => null, + }; + + /// One-line human description of a resource's failure state for the concise exception message. + internal static string DescribeState(in ResourceDiagnosis d) => d.Class switch + { + ResourceFailureClass.OutOfMemory => $"{d.State}, exit code {d.ExitCode} (OOM / SIGKILL)", + ResourceFailureClass.NonZeroExit => $"{d.State}, exit code {d.ExitCode}", + ResourceFailureClass.CrashedNoCode => $"{d.State} (no exit code)", + ResourceFailureClass.HealthCheckFailing => d.Detail is { Length: > 0 } health + ? $"Running but not Healthy ({health})" + : "Running but not Healthy", + ResourceFailureClass.NeverStarted => d.Detail is { Length: > 0 } dependency + ? $"{d.State}, waiting on {dependency}" + : $"still {d.State} (never reached Running)", + ResourceFailureClass.ContainerRuntimeDown => $"{d.State} — container runtime unreachable", + ResourceFailureClass.ImagePullFailure => $"{d.State} — image pull failed", + ResourceFailureClass.PortInUse => $"{d.State} — port already in use", + _ => d.State, + }; + + /// Reads a resource's current Aspire snapshot and classifies why it failed or is not ready. + private ResourceDiagnosis DiagnoseResource( + ResourceNotificationService notificationService, + string name, + ResourceEvent? ev, + IReadOnlyList? recentLogLines) + { + var state = "unknown"; + int? exitCode = null; + var runningButUnhealthy = false; + string? detail = null; + + if (ev is not null) + { + var snapshot = ev.Snapshot; + state = snapshot.State?.Text ?? "unknown"; + exitCode = snapshot.ExitCode; + + if (StateEquals(state, KnownResourceStates.Running) + && FirstFailingHealthReport(snapshot.HealthReports) is { } report) + { + runningButUnhealthy = true; + detail = SummarizeHealthReport(report); + } + else if (StateEquals(state, KnownResourceStates.Waiting)) + { + detail = SummarizeDependencies(notificationService, ev.Resource); + } + } + + var cls = Classify(state, exitCode, runningButUnhealthy, recentLogLines); + return new ResourceDiagnosis(name, state, exitCode, cls, detail, HintFor(cls)); + } + + private static HealthReportSnapshot? FirstFailingHealthReport(ImmutableArray reports) + { + if (reports.IsDefaultOrEmpty) + { + return null; + } + + foreach (var report in reports) + { + if (report.Status != HealthStatus.Healthy) + { + return report; + } + } + + return null; + } + + private static string SummarizeHealthReport(HealthReportSnapshot report) + { + var status = report.Status?.ToString() ?? "Pending"; + var reason = FirstLine(report.Description) ?? FirstLine(report.ExceptionText); + return reason is null ? $"{report.Name} {status}" : $"{report.Name} {status}: {reason}"; + } + + private static string? SummarizeDependencies(ResourceNotificationService notificationService, IResource resource) + { + if (!resource.TryGetAnnotationsOfType(out var waits)) + { + return null; + } + + // Surface the first dependency that is actually blocking (not yet ready), rather than + // just the first declared one — a resource can wait on several. The full artifact lists + // them all via AppendDependencyChain. + foreach (var wait in waits) + { + var depName = wait.Resource.Name; + if (notificationService.TryGetCurrentState(depName, out var dep) && dep is not null) + { + var depState = dep.Snapshot.State?.Text ?? "unknown"; + if (IsReadyState(depState)) + { + continue; + } + + return dep.Snapshot.ExitCode is int code + ? $"'{depName}' ({depState}, exit code {code})" + : $"'{depName}' ({depState})"; + } + + return $"'{depName}'"; + } + + return null; + } + + private static string? FirstLine(string? text) + { + if (string.IsNullOrWhiteSpace(text)) + { + return null; + } + + var idx = text.IndexOfAny(['\r', '\n']); + var line = (idx >= 0 ? text[..idx] : text).Trim(); + return line.Length == 0 ? null : line; + } + + private string FormatTimeline() + { + lock (_timelineGate) + { + return FormatTimeline(_timeline.ToArray()); + } + } + + /// Renders the recorded transitions as a relative-time timeline; "" only when empty. + /// A single transition (e.g. an immediate jump to FailedToStart) is still worth showing. + internal static string FormatTimeline(StateTransition[] transitions) + { + if (transitions.Length == 0) + { + return string.Empty; + } + + var origin = transitions[0].Elapsed; + var sb = new StringBuilder(); + + foreach (var t in transitions) + { + var seconds = (t.Elapsed - origin).TotalSeconds; + sb.Append($" +{seconds,5:0.0}s [{t.Resource}] {t.From} -> {t.To}"); + if (t.HealthDelta is { Length: > 0 }) + { + sb.Append($" ({t.HealthDelta})"); + } + sb.AppendLine(); + } + + return sb.ToString(); + } + + /// + /// Builds a concise diagnosis for the exception message, writes the full timeline + untruncated + /// resource logs to an attached artifact, and returns the concise text plus an artifact pointer. + /// + private async Task BuildDiagnosticsAndAttachAsync( + DistributedApplication app, + ResourceNotificationService notificationService, + string headline, + IReadOnlyList problemResources, + IReadOnlyCollection? readyResources = null) + { + var concise = new StringBuilder().Append(headline); + var full = new StringBuilder().AppendLine(headline); + + // Collect every resource's logs in parallel — each has its own short timeout, so doing + // them sequentially would compound the delay on an already-failed test's teardown. + var collected = await Task.WhenAll(problemResources.Select(async name => + (name, logLines: (IReadOnlyList)await CollectResourceLogLinesAsync(app, name)))); + + foreach (var (name, logLines) in collected) + { + // Read the snapshot once and thread it through both the diagnosis and the rendering, + // rather than each re-querying live state mid-render. + notificationService.TryGetCurrentState(name, out var ev); + var diagnosis = DiagnoseResource(notificationService, name, ev, logLines); + + concise.AppendLine().Append($" {name}: {DescribeState(diagnosis)}"); + if (diagnosis.Hint is not null) + { + concise.AppendLine().Append($" Hint: {diagnosis.Hint}"); + } + + AppendResourceDiagnosticBlock(full, notificationService, name, ev, diagnosis, logLines); + } + + if (readyResources is { Count: > 0 }) + { + var readyList = string.Join(", ", readyResources.Select(n => $"'{n}'")); + concise.AppendLine().Append($" Ready: [{readyList}]"); + full.AppendLine().AppendLine($"Ready resources: [{readyList}]"); + } + + var timeline = FormatTimeline(); + if (timeline.Length > 0) + { + full.AppendLine().AppendLine("--- startup timeline ---").Append(timeline); + } + + var pointer = await TryWriteDiagnosticsArtifactAsync(full.ToString()); + if (pointer is not null) + { + concise.AppendLine().AppendLine().Append(pointer); + } + + return concise.ToString(); + } + + private void AppendResourceDiagnosticBlock( + StringBuilder sb, + ResourceNotificationService notificationService, + string name, + ResourceEvent? ev, + in ResourceDiagnosis diagnosis, + IReadOnlyList logLines) + { + sb.AppendLine().AppendLine($"--- {name} diagnostics ---"); + sb.AppendLine($" State: {diagnosis.State}"); + if (diagnosis.ExitCode is int code) + { + sb.AppendLine($" Exit code: {code}"); + } + if (diagnosis.Hint is not null) + { + sb.AppendLine($" Hint: {diagnosis.Hint}"); + } + + if (ev is not null) + { + AppendHealthReports(sb, ev.Snapshot.HealthReports); + AppendDependencyChain(sb, notificationService, ev.Resource); + } + + sb.AppendLine($" Logs ({(logLines.Count == 0 ? "none" : logLines.Count + " line(s)")}):"); + if (logLines.Count == 0) + { + sb.AppendLine(" (no logs available)"); + } + else + { + foreach (var line in logLines) + { + sb.Append(" ").AppendLine(line); + } + } + } + + private static void AppendHealthReports(StringBuilder sb, ImmutableArray reports) + { + if (reports.IsDefaultOrEmpty) + { + return; + } + + var failing = reports.Where(r => r.Status != HealthStatus.Healthy).ToList(); + if (failing.Count == 0) + { + return; + } + + sb.AppendLine(" Health checks:"); + foreach (var report in failing) + { + var status = report.Status?.ToString() ?? "Pending"; + sb.Append($" [{report.Name}] {status}"); + if (FirstLine(report.Description) is { } description) + { + sb.Append($" — {description}"); + } + sb.AppendLine(); + + if (!string.IsNullOrWhiteSpace(report.ExceptionText)) + { + foreach (var line in report.ExceptionText.Split('\n')) + { + sb.Append(" ").AppendLine(line.TrimEnd('\r')); + } + } + } + } + + private static void AppendDependencyChain(StringBuilder sb, ResourceNotificationService notificationService, IResource resource) + { + if (!resource.TryGetAnnotationsOfType(out var waits)) + { + return; + } + + var list = waits.ToList(); + if (list.Count == 0) + { + return; + } + + sb.AppendLine(" Waiting on dependencies:"); + foreach (var wait in list) + { + var depName = wait.Resource.Name; + string description; + if (notificationService.TryGetCurrentState(depName, out var dep) && dep is not null) + { + var depState = dep.Snapshot.State?.Text ?? "unknown"; + description = dep.Snapshot.ExitCode is int code ? $"{depState}, exit code {code}" : depState; + } + else + { + description = "unknown"; + } + + sb.AppendLine($" - {depName} [{wait.WaitType}] -> {description}"); + } + } + + private async Task TryWriteDiagnosticsArtifactAsync(string content) + { + try + { + var directory = TestContext.OutputDirectory ?? Path.GetTempPath(); + // Include a GUID so two fixtures of the same TAppHost timing out in the same + // millisecond can't race to the same path and overwrite each other's artifact. + var fileName = $"aspire-diagnostics-{typeof(TAppHost).Name}-{DateTime.UtcNow:yyyyMMdd-HHmmss-fff}-{Guid.NewGuid():N}.log"; + var path = Path.Combine(directory, fileName); + + await File.WriteAllTextAsync(path, content); + + var artifact = new Artifact + { + File = new FileInfo(path), + DisplayName = $"Aspire startup diagnostics ({typeof(TAppHost).Name})", + Description = "Full Aspire resource startup timeline and logs.", + }; + + if (TestContext.Current is { } testContext) + { + testContext.Output.AttachArtifact(artifact); + } + else if (TestSessionContext.Current is { } sessionContext) + { + sessionContext.AddArtifact(artifact); + } + + return $"Full startup diagnostics (timeline + untruncated logs) written to: {path}"; + } + catch (Exception ex) + { + // Diagnostics must never mask the underlying startup failure — surface the + // artifact-write problem via the progress log but otherwise carry on. + LogProgress($"(failed to write diagnostics artifact: {ex.Message})"); + return null; + } + } + private sealed class ResourceLogWatcher(CancellationTokenSource cts) : IAsyncDisposable { public ValueTask DisposeAsync() diff --git a/TUnit.Aspire.Tests/AspireDiagnosticsTests.cs b/TUnit.Aspire.Tests/AspireDiagnosticsTests.cs new file mode 100644 index 0000000000..8e6c4a60c7 --- /dev/null +++ b/TUnit.Aspire.Tests/AspireDiagnosticsTests.cs @@ -0,0 +1,176 @@ +using TUnit.Assertions; +using TUnit.Assertions.Extensions; +using TUnit.Core; +using static TUnit.Aspire.AspireFixture; + +namespace TUnit.Aspire.Tests; + +/// +/// Unit tests for the Aspire startup-diagnosis helpers (no Docker required). These cover the pure +/// classification, hint, description, and timeline-formatting logic that turns an opaque startup +/// failure into an actionable message. The snapshot-reading paths are exercised by the +/// Docker-backed integration tests. +/// +public class AspireDiagnosticsTests +{ + [Test] + public async Task Classify_ExitCode137_IsOutOfMemory() + => await Assert.That(Classify("Exited", 137, runningButUnhealthy: false, logLines: null)) + .IsEqualTo(ResourceFailureClass.OutOfMemory); + + [Test] + public async Task Classify_TerminalWithNonZeroExit_IsNonZeroExit() + => await Assert.That(Classify("FailedToStart", 1, runningButUnhealthy: false, logLines: null)) + .IsEqualTo(ResourceFailureClass.NonZeroExit); + + [Test] + public async Task Classify_TerminalWithNoExitCode_IsCrashedNoCode() + => await Assert.That(Classify("FailedToStart", null, runningButUnhealthy: false, logLines: null)) + .IsEqualTo(ResourceFailureClass.CrashedNoCode); + + [Test] + public async Task Classify_RunningButUnhealthy_IsHealthCheckFailing() + => await Assert.That(Classify("Running", null, runningButUnhealthy: true, logLines: null)) + .IsEqualTo(ResourceFailureClass.HealthCheckFailing); + + [Test] + public async Task Classify_Starting_IsNeverStarted() + => await Assert.That(Classify("Starting", null, runningButUnhealthy: false, logLines: null)) + .IsEqualTo(ResourceFailureClass.NeverStarted); + + [Test] + public async Task Classify_Waiting_IsNeverStarted() + => await Assert.That(Classify("Waiting", null, runningButUnhealthy: false, logLines: null)) + .IsEqualTo(ResourceFailureClass.NeverStarted); + + [Test] + public async Task Classify_RuntimeUnhealthyState_IsContainerRuntimeDown() + => await Assert.That(Classify("RuntimeUnhealthy", null, runningButUnhealthy: false, logLines: null)) + .IsEqualTo(ResourceFailureClass.ContainerRuntimeDown); + + [Test] + public async Task Classify_DockerDaemonLogSignature_IsContainerRuntimeDown() + { + string[] logs = ["Cannot connect to the Docker daemon at unix:///var/run/docker.sock"]; + + // Log signature wins even when the snapshot state is unknown. + await Assert.That(Classify("unknown", null, runningButUnhealthy: false, logs)) + .IsEqualTo(ResourceFailureClass.ContainerRuntimeDown); + } + + [Test] + public async Task Classify_PortInUseLogSignature_IsPortInUse() + { + string[] logs = ["Error: bind: address already in use"]; + + await Assert.That(Classify("FailedToStart", 1, runningButUnhealthy: false, logs)) + .IsEqualTo(ResourceFailureClass.PortInUse); + } + + [Test] + public async Task ScanLogSignatures_ImagePull_IsImagePullFailure() + => await Assert.That(ScanLogSignatures(["Error response from daemon: pull access denied for foo/bar"])) + .IsEqualTo(ResourceFailureClass.ImagePullFailure); + + [Test] + public async Task ScanLogSignatures_NoSignature_IsNull() + => await Assert.That(ScanLogSignatures(["starting up", "listening on port 8080"])) + .IsNull(); + + [Test] + public async Task HintFor_KnownClass_IsActionable() + => await Assert.That(HintFor(ResourceFailureClass.OutOfMemory)).Contains("OOM"); + + [Test] + public async Task HintFor_Unknown_IsNull() + => await Assert.That(HintFor(ResourceFailureClass.Unknown)).IsNull(); + + [Test] + public async Task DescribeState_OutOfMemory_MentionsOom() + { + var diagnosis = new ResourceDiagnosis("cache", "Exited", 137, ResourceFailureClass.OutOfMemory, null, null); + + await Assert.That(DescribeState(diagnosis)).Contains("OOM / SIGKILL"); + } + + [Test] + public async Task DescribeState_HealthCheckFailing_IncludesDetail() + { + var diagnosis = new ResourceDiagnosis( + "api", "Running", null, ResourceFailureClass.HealthCheckFailing, + Detail: "ready Unhealthy: connection refused", Hint: null); + + var description = DescribeState(diagnosis); + + await Assert.That(description).Contains("Running but not Healthy"); + await Assert.That(description).Contains("connection refused"); + } + + [Test] + public async Task DescribeState_NeverStartedWithDependency_NamesDependency() + { + var diagnosis = new ResourceDiagnosis( + "api", "Waiting", null, ResourceFailureClass.NeverStarted, + Detail: "'migrations' (Exited, exit code 1)", Hint: null); + + await Assert.That(DescribeState(diagnosis)).Contains("waiting on 'migrations'"); + } + + [Test] + public async Task FormatTimeline_NoEntries_IsEmpty() + => await Assert.That(FormatTimeline([])).IsEmpty(); + + [Test] + public async Task FormatTimeline_SingleEntry_StillRenders() + { + StateTransition[] single = [new(TimeSpan.Zero, "api", "(none)", "FailedToStart", null)]; + + await Assert.That(FormatTimeline(single)).Contains("[api] (none) -> FailedToStart"); + } + + [Test] + public async Task TrimTimeline_KeepsHeadAndTail_DropsMiddle() + { + var list = new List(); + for (var i = 0; i < 400; i++) + { + list.Add(new StateTransition(TimeSpan.FromSeconds(i), "r", i.ToString(), (i + 1).ToString(), null)); + } + + TrimTimeline(list, head: 80, tail: 120); + + await Assert.That(list.Count).IsEqualTo(200); + await Assert.That(list[0].From).IsEqualTo("0"); // earliest startup transition preserved + await Assert.That(list[^1].To).IsEqualTo("400"); // most recent transition preserved + } + + [Test] + public async Task TrimTimeline_UnderCap_LeavesUnchanged() + { + var list = new List + { + new(TimeSpan.Zero, "r", "(none)", "Starting", null), + new(TimeSpan.FromSeconds(1), "r", "Starting", "Running", null), + }; + + TrimTimeline(list, head: 80, tail: 120); + + await Assert.That(list.Count).IsEqualTo(2); + } + + [Test] + public async Task FormatTimeline_MultipleEntries_RendersTransitions() + { + StateTransition[] transitions = + [ + new(TimeSpan.Zero, "api", "(none)", "Starting", null), + new(TimeSpan.FromSeconds(1.2), "api", "Starting", "Running", null), + new(TimeSpan.FromSeconds(3.4), "api", "Running", "FailedToStart", null), + ]; + + var timeline = FormatTimeline(transitions); + + await Assert.That(timeline).Contains("[api] Starting -> Running"); + await Assert.That(timeline).Contains("FailedToStart"); + } +}