From a9cf378192ec339335efb8322aea09755861811b Mon Sep 17 00:00:00 2001 From: Francois ROBION Date: Tue, 30 Sep 2025 16:19:31 +0200 Subject: [PATCH 1/2] Allow heartbeat to restart the pipe thread with only sync commands There is a thread looping in the method PhysicalConnection.ReadFromPipe to process response from Redis, match them with the sent command and signaling the completion of the message. If this thread has an exception, its catch block will call RecordConnectionFailed which will proceed to restart a new thread to continue reading Redis responses. However, if another exception occurred in the catch before the new thread can be started (in a case of high memory pressure, OOM exceptions can happen anywhere) we are in a state where no one is reading the pipe of Redis responses, and all commands sent end in timeout. If at least one async command is sent, the heartbeat thread will detect the timeout in the OnBridgeHeartbeat method, and if no read were perform for 4 heartbeat it will issue a connection failure. With this commit, this becomes true for sync commands as well. Therefore, it ensures we will not reach a state were all commands end in timeout. --- src/StackExchange.Redis/PhysicalBridge.cs | 7 ++++--- src/StackExchange.Redis/PhysicalConnection.cs | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/StackExchange.Redis/PhysicalBridge.cs b/src/StackExchange.Redis/PhysicalBridge.cs index c430cf5af..95b1d62e9 100644 --- a/src/StackExchange.Redis/PhysicalBridge.cs +++ b/src/StackExchange.Redis/PhysicalBridge.cs @@ -615,7 +615,7 @@ internal void OnHeartbeat(bool ifConnectedOnly) Interlocked.Exchange(ref connectTimeoutRetryCount, 0); tmp.BridgeCouldBeNull?.ServerEndPoint?.ClearUnselectable(UnselectableFlags.DidNotRespond); } - int timedOutThisHeartbeat = tmp.OnBridgeHeartbeat(); + tmp.OnBridgeHeartbeat(out int asyncTimeoutThisHeartbeat, out int syncTimeoutThisHeartbeat); int writeEverySeconds = ServerEndPoint.WriteEverySeconds; bool configCheckDue = ServerEndPoint.ConfigCheckSeconds > 0 && ServerEndPoint.LastInfoReplicationCheckSecondsAgo >= ServerEndPoint.ConfigCheckSeconds; @@ -664,14 +664,15 @@ internal void OnHeartbeat(bool ifConnectedOnly) } // This is an "always" check - we always want to evaluate a dead connection from a non-responsive sever regardless of the need to heartbeat above - if (timedOutThisHeartbeat > 0 + var totalTimeoutThisHeartbeat = asyncTimeoutThisHeartbeat + syncTimeoutThisHeartbeat; + if ((totalTimeoutThisHeartbeat > 0) && tmp.LastReadSecondsAgo * 1_000 > (tmp.BridgeCouldBeNull?.Multiplexer.AsyncTimeoutMilliseconds * 4)) { // If we've received *NOTHING* on the pipe in 4 timeouts worth of time and we're timing out commands, issue a connection failure so that we reconnect // This is meant to address the scenario we see often in Linux configs where TCP retries will happen for 15 minutes. // To us as a client, we'll see the socket as green/open/fine when writing but we'll bet getting nothing back. // Since we can't depend on the pipe to fail in that case, we want to error here based on the criteria above so we reconnect broken clients much faster. - tmp.BridgeCouldBeNull?.Multiplexer.Logger?.LogWarningDeadSocketDetected(tmp.LastReadSecondsAgo, timedOutThisHeartbeat); + tmp.BridgeCouldBeNull?.Multiplexer.Logger?.LogWarningDeadSocketDetected(tmp.LastReadSecondsAgo, totalTimeoutThisHeartbeat); OnDisconnected(ConnectionFailureType.SocketFailure, tmp, out _, out State oldState); tmp.Dispose(); // Cleanup the existing connection/socket if any, otherwise it will wait reading indefinitely } diff --git a/src/StackExchange.Redis/PhysicalConnection.cs b/src/StackExchange.Redis/PhysicalConnection.cs index 129fd9e07..4b86d7a51 100644 --- a/src/StackExchange.Redis/PhysicalConnection.cs +++ b/src/StackExchange.Redis/PhysicalConnection.cs @@ -746,10 +746,12 @@ internal void GetStormLog(StringBuilder sb) /// /// Runs on every heartbeat for a bridge, timing out any commands that are overdue and returning an integer of how many we timed out. /// - /// How many commands were overdue and threw timeout exceptions. - internal int OnBridgeHeartbeat() + /// How many async commands were overdue and threw timeout exceptions. + /// How many sync commands were overdue. No exception are thrown for these commands here. + internal void OnBridgeHeartbeat(out int asyncTimeoutDetected, out int syncTimeoutDetected) { - var result = 0; + asyncTimeoutDetected = 0; + syncTimeoutDetected = 0; var now = Environment.TickCount; Interlocked.Exchange(ref lastBeatTickCount, now); @@ -776,7 +778,13 @@ internal int OnBridgeHeartbeat() multiplexer.OnMessageFaulted(msg, timeoutEx); msg.SetExceptionAndComplete(timeoutEx, bridge); // tell the message that it is doomed multiplexer.OnAsyncTimeout(); - result++; + asyncTimeoutDetected++; + } + else + { + // Only count how many sync timeouts we detect here. + // The actual timeout is handled in ConnectionMultiplexer.ExecuteSyncImpl(). + syncTimeoutDetected++; } } else @@ -791,7 +799,6 @@ internal int OnBridgeHeartbeat() } } } - return result; } internal void OnInternalError(Exception exception, [CallerMemberName] string? origin = null) From 0ec7c4aa48919bc137e4f2e893d6d32d0c28ee71 Mon Sep 17 00:00:00 2001 From: Francois ROBION Date: Thu, 16 Oct 2025 11:22:40 +0200 Subject: [PATCH 2/2] Compare count of sync commands timeouted with sync timeout --- src/StackExchange.Redis/PhysicalBridge.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/StackExchange.Redis/PhysicalBridge.cs b/src/StackExchange.Redis/PhysicalBridge.cs index 95b1d62e9..ecaf3be0d 100644 --- a/src/StackExchange.Redis/PhysicalBridge.cs +++ b/src/StackExchange.Redis/PhysicalBridge.cs @@ -665,8 +665,9 @@ internal void OnHeartbeat(bool ifConnectedOnly) // This is an "always" check - we always want to evaluate a dead connection from a non-responsive sever regardless of the need to heartbeat above var totalTimeoutThisHeartbeat = asyncTimeoutThisHeartbeat + syncTimeoutThisHeartbeat; - if ((totalTimeoutThisHeartbeat > 0) - && tmp.LastReadSecondsAgo * 1_000 > (tmp.BridgeCouldBeNull?.Multiplexer.AsyncTimeoutMilliseconds * 4)) + bool deadConnectionOnAsync = asyncTimeoutThisHeartbeat > 0 && tmp.LastReadSecondsAgo * 1_000 > (tmp.BridgeCouldBeNull?.Multiplexer.AsyncTimeoutMilliseconds * 4); + bool deadConnectionOnSync = syncTimeoutThisHeartbeat > 0 && tmp.LastReadSecondsAgo * 1_000 > (tmp.BridgeCouldBeNull?.Multiplexer.TimeoutMilliseconds * 4); + if (deadConnectionOnAsync || deadConnectionOnSync) { // If we've received *NOTHING* on the pipe in 4 timeouts worth of time and we're timing out commands, issue a connection failure so that we reconnect // This is meant to address the scenario we see often in Linux configs where TCP retries will happen for 15 minutes.