// Brainarr.Tests/Services/Resilience/CircuitBreakerCharacterizationTests.cs
using System;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using FluentAssertions;
using NLog;
using NzbDrone.Core.ImportLists.Brainarr.Services.Resilience;
using Xunit;

namespace Brainarr.Tests.Services.Resilience
{
    /// <summary>
    /// Characterization tests that lock down the current circuit breaker behavior.
    /// These tests document existing semantics to make WS4.2 migration decisions objective.
    /// </summary>
    [Trait("Category", "Unit")]
    public sealed class CircuitBreakerCharacterizationTests
    {
        /// <summary>Resource key shared by every breaker that does not test the keying scheme itself.</summary>
        private const string TestResource = "ai:test:model";

        // Resolved once: a property getter would look the logger up again on every access.
        private static readonly Logger L = LogManager.GetCurrentClassLogger();

        /// <summary>Creates a breaker for <see cref="TestResource"/> with the given options.</summary>
        private static CircuitBreaker CreateBreaker(CircuitBreakerOptions options)
            => new CircuitBreaker(TestResource, options, L);

        /// <summary>
        /// Options under which a single recorded failure opens the circuit and a single
        /// half-open success closes it again; only the break duration varies between tests.
        /// </summary>
        private static CircuitBreakerOptions OpenOnFirstFailure(TimeSpan breakDuration)
            => new CircuitBreakerOptions
            {
                FailureThreshold = 1,
                FailureRateThreshold = 1.0,
                BreakDuration = breakDuration,
                HalfOpenSuccessThreshold = 1,
                SamplingWindowSize = 10,
                MinimumThroughput = 1
            };

        /// <summary>
        /// Options used by the failure classification tests: one recorded failure opens the
        /// circuit, and the remaining settings are left at whatever the options type defaults to.
        /// </summary>
        private static CircuitBreakerOptions ClassificationOptions()
            => new CircuitBreakerOptions
            {
                FailureThreshold = 1,
                FailureRateThreshold = 1.0,
                BreakDuration = TimeSpan.FromMinutes(10),
                MinimumThroughput = 1
            };

        /// <summary>
        /// Runs an operation that faults with <paramref name="failure"/> through the breaker and
        /// asserts that exactly that exception type reaches the caller.
        /// </summary>
        private static Task<TException> AssertFailsAsync<TException>(CircuitBreaker cb, TException failure)
            where TException : Exception
            => Assert.ThrowsAsync<TException>(async () =>
                await cb.ExecuteAsync(() => Task.FromException<int>(failure)));

        [Fact]
        public void Starts_Closed()
        {
            var cb = CreateBreaker(CircuitBreakerOptions.Default);

            cb.State.Should().Be(CircuitState.Closed);
            cb.ConsecutiveFailures.Should().Be(0);
            cb.FailureRate.Should().Be(0);
        }

        [Fact]
        public async Task Opens_After_Handled_Exception_And_Blocks_While_Open()
        {
            var cb = CreateBreaker(OpenOnFirstFailure(TimeSpan.FromMinutes(10)));

            await AssertFailsAsync(cb, new TimeoutException("timeout"));

            cb.State.Should().Be(CircuitState.Open);

            // The operation itself cannot throw, so any exception here comes from the breaker.
            // NOTE(review): the concrete rejection exception type is not visible from this file;
            // tighten ThrowsAnyAsync<Exception> to the breaker's open-circuit exception once confirmed.
            await Assert.ThrowsAnyAsync<Exception>(async () =>
                await cb.ExecuteAsync(() => Task.FromResult(42)));
        }

        [Fact]
        public async Task HalfOpen_Success_Closes_When_BreakDuration_Elapsed()
        {
            // A zero break duration means the very next call is allowed through as a probe.
            var cb = CreateBreaker(OpenOnFirstFailure(TimeSpan.Zero));

            await AssertFailsAsync(cb, new TimeoutException("timeout"));

            cb.State.Should().Be(CircuitState.Open);

            var result = await cb.ExecuteAsync(() => Task.FromResult(42));

            result.Should().Be(42);
            cb.State.Should().Be(CircuitState.Closed);
        }

        [Fact]
        public async Task ExecuteWithFallback_Returns_Fallback_When_Open_And_Does_Not_Invoke_Operation()
        {
            var cb = CreateBreaker(OpenOnFirstFailure(TimeSpan.FromMinutes(10)));

            await AssertFailsAsync(cb, new TimeoutException("timeout"));

            var invoked = false;
            var fallback = await cb.ExecuteWithFallbackAsync(
                () =>
                {
                    invoked = true;
                    return Task.FromResult(123);
                },
                fallbackValue: 7);

            fallback.Should().Be(7);
            invoked.Should().BeFalse();
        }

        [Fact]
        public async Task CircuitOpened_And_Closed_Events_Fire()
        {
            var cb = CreateBreaker(OpenOnFirstFailure(TimeSpan.Zero));

            CircuitBreakerEventArgs? opened = null;
            CircuitBreakerEventArgs? closed = null;

            cb.CircuitOpened += (_, args) => opened = args;
            cb.CircuitClosed += (_, args) => closed = args;

            await AssertFailsAsync(cb, new HttpRequestException("network"));

            opened.Should().NotBeNull();
            opened!.ResourceName.Should().Be(TestResource);
            opened.State.Should().Be(CircuitState.Open);

            await cb.ExecuteAsync(() => Task.FromResult(1));

            closed.Should().NotBeNull();
            closed!.ResourceName.Should().Be(TestResource);
            closed.State.Should().Be(CircuitState.Closed);
        }

        [Fact]
        public async Task Reset_Closes_And_Clears_Statistics()
        {
            var cb = CreateBreaker(OpenOnFirstFailure(TimeSpan.FromMinutes(10)));

            await AssertFailsAsync(cb, new TimeoutException("timeout"));

            cb.State.Should().Be(CircuitState.Open);
            cb.Reset();

            cb.State.Should().Be(CircuitState.Closed);
            cb.ConsecutiveFailures.Should().Be(0);
            cb.GetStatistics().TotalOperations.Should().Be(0);
        }

        #region Keying Scheme Tests

        [Fact]
        public void ResourceName_Uses_Keying_Format()
        {
            // The keying format is "ai:{provider}:{modelId}" as established in BreakerRegistry
            var cb = new CircuitBreaker("ai:openai:gpt-4", CircuitBreakerOptions.Default, L);

            cb.ResourceName.Should().Be("ai:openai:gpt-4");
        }

        [Theory]
        [InlineData("ai:anthropic:claude-3-opus")]
        [InlineData("ai:ollama:llama2")]
        [InlineData("ai:deepseek:deepseek-chat")]
        public void ResourceName_Preserved_For_Any_Provider_Model_Combination(string resourceName)
        {
            var cb = new CircuitBreaker(resourceName, CircuitBreakerOptions.Default, L);

            cb.ResourceName.Should().Be(resourceName);
        }

        #endregion

        #region Failure Classification Tests

        [Fact]
        public async Task TaskCanceledException_Is_Treated_As_Failure()
        {
            // CURRENT BEHAVIOR: TaskCanceledException trips the breaker
            // This may be surprising - cancellation is treated same as failure
            var cb = CreateBreaker(ClassificationOptions());

            await AssertFailsAsync(cb, new TaskCanceledException("cancelled"));

            cb.State.Should().Be(CircuitState.Open, "TaskCanceledException is treated as a failure");
            cb.ConsecutiveFailures.Should().Be(1);
        }

        [Fact]
        public async Task HttpRequestException_Is_Treated_As_Failure()
        {
            var cb = CreateBreaker(ClassificationOptions());

            await AssertFailsAsync(cb, new HttpRequestException("network error"));

            cb.State.Should().Be(CircuitState.Open);
            cb.ConsecutiveFailures.Should().Be(1);
        }

        [Fact]
        public async Task Client_Error_With_BadRequest_Does_Not_Trip_Breaker()
        {
            // CURRENT BEHAVIOR: HttpRequestException with "4" AND "Bad Request" in message is excluded
            // This is brittle string-based client error detection (should use status codes in future)
            //
            // NOTE: This test is intentionally message-coupled because production uses message-based
            // detection (ex.Message.Contains("4") && ex.Message.Contains("Bad Request")). Do not
            // "simplify" to status-code checks unless production is also refactored.
            var cb = CreateBreaker(ClassificationOptions());

            // HttpRequestException (normally a handled type) with "4" + "Bad Request" - excluded by string match
            var clientError = new HttpRequestException("400 Bad Request");
            await AssertFailsAsync(cb, clientError);

            cb.State.Should().Be(CircuitState.Closed, "Client errors excluded by string matching");
            cb.ConsecutiveFailures.Should().Be(0);
        }

        [Fact]
        public async Task Generic_Exception_Without_BadRequest_Does_Not_Trip_Breaker()
        {
            // Non-handled exceptions pass through without recording failure
            var cb = CreateBreaker(ClassificationOptions());

            var genericError = new InvalidOperationException("some logic error");
            await AssertFailsAsync(cb, genericError);

            cb.State.Should().Be(CircuitState.Closed, "Non-handled exceptions don't trip breaker");
            cb.ConsecutiveFailures.Should().Be(0, "Non-handled exceptions don't record as failures");
        }

        #endregion

        #region Consecutive Failures Threshold Tests

        [Fact]
        public async Task Opens_After_Consecutive_Failures_Threshold()
        {
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 5, // Default
                FailureRateThreshold = 1.0, // Disable rate-based opening
                BreakDuration = TimeSpan.FromMinutes(10),
                MinimumThroughput = 100 // High minimum to prevent rate-based opening
            };
            var cb = CreateBreaker(options);

            // 4 failures - should remain closed
            for (int i = 0; i < 4; i++)
            {
                await AssertFailsAsync(cb, new TimeoutException());
            }

            cb.State.Should().Be(CircuitState.Closed);
            cb.ConsecutiveFailures.Should().Be(4);

            // 5th failure - opens the circuit
            await AssertFailsAsync(cb, new TimeoutException());

            cb.State.Should().Be(CircuitState.Open);
            cb.ConsecutiveFailures.Should().Be(5);
        }

        [Fact]
        public async Task Success_Resets_Consecutive_Failure_Counter()
        {
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 5,
                FailureRateThreshold = 1.0,
                BreakDuration = TimeSpan.FromMinutes(10),
                MinimumThroughput = 100
            };
            var cb = CreateBreaker(options);

            // 3 failures
            for (int i = 0; i < 3; i++)
            {
                await AssertFailsAsync(cb, new TimeoutException());
            }

            cb.ConsecutiveFailures.Should().Be(3);

            // 1 success resets counter
            await cb.ExecuteAsync(() => Task.FromResult(42));
            cb.ConsecutiveFailures.Should().Be(0);

            // Circuit should still be closed
            cb.State.Should().Be(CircuitState.Closed);
        }

        #endregion

        #region Failure Rate Threshold Tests

        [Fact]
        public async Task Opens_When_Failure_Rate_Exceeds_Threshold_And_Minimum_Throughput_Met()
        {
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 100, // High to prevent consecutive-based opening
                FailureRateThreshold = 0.5, // 50%
                BreakDuration = TimeSpan.FromMinutes(10),
                SamplingWindowSize = 20,
                MinimumThroughput = 10
            };
            var cb = CreateBreaker(options);

            // 5 successes
            for (int i = 0; i < 5; i++)
            {
                await cb.ExecuteAsync(() => Task.FromResult(i));
            }

            cb.State.Should().Be(CircuitState.Closed);

            // 4 failures (4/9 = 44% < 50%, circuit stays closed)
            for (int i = 0; i < 4; i++)
            {
                await AssertFailsAsync(cb, new TimeoutException());
            }

            cb.State.Should().Be(CircuitState.Closed, "9 ops at 44% failure rate - below threshold");

            // 1 more failure (5/10 = 50%, meets threshold and minimum throughput)
            await AssertFailsAsync(cb, new TimeoutException());

            cb.State.Should().Be(CircuitState.Open, "10 ops at 50% failure rate - meets threshold");
            cb.FailureRate.Should().BeApproximately(0.5, 0.01);
        }

        [Fact]
        public async Task Does_Not_Open_On_High_Failure_Rate_Below_Minimum_Throughput()
        {
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 100, // High to prevent consecutive-based opening
                FailureRateThreshold = 0.5, // 50%
                BreakDuration = TimeSpan.FromMinutes(10),
                SamplingWindowSize = 20,
                MinimumThroughput = 10
            };
            var cb = CreateBreaker(options);

            // 1 success, 3 failures (75% failure rate but only 4 ops < 10 minimum)
            await cb.ExecuteAsync(() => Task.FromResult(1));
            for (int i = 0; i < 3; i++)
            {
                await AssertFailsAsync(cb, new TimeoutException());
            }

            cb.FailureRate.Should().BeApproximately(0.75, 0.01);
            cb.State.Should().Be(CircuitState.Closed, "Below minimum throughput - rate-based opening disabled");
        }

        #endregion

        #region Half-Open State Transition Tests

        [Fact]
        public async Task HalfOpen_Closes_After_Configured_Successes()
        {
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 1,
                FailureRateThreshold = 1.0,
                BreakDuration = TimeSpan.Zero, // Immediate transition to half-open
                HalfOpenSuccessThreshold = 3,
                MinimumThroughput = 1
            };
            var cb = CreateBreaker(options);

            // Open the circuit
            await AssertFailsAsync(cb, new TimeoutException());
            cb.State.Should().Be(CircuitState.Open);

            // First success - transitions to half-open, stays half-open
            await cb.ExecuteAsync(() => Task.FromResult(1));
            cb.State.Should().Be(CircuitState.HalfOpen, "1 success in half-open, need 3 to close");

            // Second success
            await cb.ExecuteAsync(() => Task.FromResult(2));
            cb.State.Should().Be(CircuitState.HalfOpen, "2 successes in half-open, need 3 to close");

            // Third success - closes circuit
            await cb.ExecuteAsync(() => Task.FromResult(3));
            cb.State.Should().Be(CircuitState.Closed, "3 successes closes the circuit");
        }

        [Fact]
        public async Task HalfOpen_Failure_Immediately_Reopens()
        {
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 1,
                FailureRateThreshold = 1.0,
                BreakDuration = TimeSpan.Zero,
                HalfOpenSuccessThreshold = 3,
                MinimumThroughput = 1
            };
            var cb = CreateBreaker(options);

            // Open the circuit
            await AssertFailsAsync(cb, new TimeoutException());
            cb.State.Should().Be(CircuitState.Open);

            // 1 success to enter half-open
            await cb.ExecuteAsync(() => Task.FromResult(1));
            cb.State.Should().Be(CircuitState.HalfOpen);

            // Failure in half-open immediately reopens
            await AssertFailsAsync(cb, new TimeoutException());

            cb.State.Should().Be(CircuitState.Open, "Any failure in half-open reopens the circuit");
        }

        #endregion

        #region Windowing / CircularBuffer Tests

        [Fact]
        public async Task CircularBuffer_Wraps_And_Maintains_Accurate_FailureRate()
        {
            // This test verifies that failure rate is calculated over the sliding window,
            // and that old operations get pushed out as new ones arrive.
            var options = new CircuitBreakerOptions
            {
                FailureThreshold = 100, // High to prevent consecutive-based opening
                FailureRateThreshold = 0.7, // 70% - higher threshold to observe window behavior
                BreakDuration = TimeSpan.FromMinutes(10),
                SamplingWindowSize = 5, // Small window for testing
                MinimumThroughput = 3
            };
            var cb = CreateBreaker(options);

            // Phase 1: Fill buffer with 3 successes, 2 failures = 40% failure rate
            // Window: [S, S, S, F, F]
            for (int i = 0; i < 3; i++)
            {
                await cb.ExecuteAsync(() => Task.FromResult(i));
            }

            for (int i = 0; i < 2; i++)
            {
                await AssertFailsAsync(cb, new TimeoutException());
            }

            cb.FailureRate.Should().BeApproximately(0.4, 0.01);
            cb.State.Should().Be(CircuitState.Closed);

            // Phase 2: Add 1 more failure - pushes out oldest success
            // Window: [S, S, F, F, F] = 60% failure rate (still below 70%)
            await AssertFailsAsync(cb, new TimeoutException());

            cb.FailureRate.Should().BeApproximately(0.6, 0.01);
            cb.State.Should().Be(CircuitState.Closed, "60% is below 70% threshold");

            // Phase 3: Add 1 more failure - pushes out another success
            // Window: [S, F, F, F, F] = 80% failure rate (exceeds 70%)
            await AssertFailsAsync(cb, new TimeoutException());

            cb.FailureRate.Should().BeGreaterThanOrEqualTo(0.7);
            cb.State.Should().Be(CircuitState.Open, "Window wrapped, failure rate now exceeds threshold");
        }

        [Fact]
        public void GetStatistics_Returns_Correct_Initial_State()
        {
            var options = new CircuitBreakerOptions { SamplingWindowSize = 10 };
            var cb = CreateBreaker(options);

            var stats = cb.GetStatistics();

            stats.ResourceName.Should().Be(TestResource);
            stats.State.Should().Be(CircuitState.Closed);
            stats.TotalOperations.Should().Be(0);
            stats.ConsecutiveFailures.Should().Be(0);
            stats.FailureRate.Should().Be(0);
            stats.NextHalfOpenAttempt.Should().BeNull();
        }

        #endregion

        #region Configuration Constants Tests

        [Fact]
        public void Default_Options_Use_Brainarr_Constants()
        {
            // Document the default configuration values from BrainarrConstants
            var defaults = CircuitBreakerOptions.Default;

            // These values come from BrainarrConstants
            defaults.FailureThreshold.Should().Be(5, "consecutive failures to open");
            defaults.FailureRateThreshold.Should().Be(0.5, "50% failure rate threshold");
            defaults.BreakDuration.Should().Be(TimeSpan.FromSeconds(30), "30 second open duration");
            defaults.HalfOpenSuccessThreshold.Should().Be(3, "3 successes to close from half-open");
            defaults.SamplingWindowSize.Should().Be(20, "20 operation sampling window");
            defaults.MinimumThroughput.Should().Be(10, "10 minimum operations for rate-based opening");
        }

        [Fact]
        public void Aggressive_Options_Presets()
        {
            var aggressive = CircuitBreakerOptions.Aggressive;

            aggressive.FailureThreshold.Should().Be(3);
            aggressive.FailureRateThreshold.Should().Be(0.3);
            aggressive.BreakDuration.Should().Be(TimeSpan.FromMinutes(5));
        }

        [Fact]
        public void Lenient_Options_Presets()
        {
            var lenient = CircuitBreakerOptions.Lenient;

            lenient.FailureThreshold.Should().Be(10);
            lenient.FailureRateThreshold.Should().Be(0.75);
            lenient.BreakDuration.Should().Be(TimeSpan.FromSeconds(30));
        }

        #endregion
    }
}