-
Notifications
You must be signed in to change notification settings - Fork 56
Harden gRPC worker and client against silent disconnects #708
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 4 commits
Commits
Show all changes
30 commits
Select commit
Hold shift + click to select a range
5f58004
Harden gRPC worker and client against silent disconnects
Copilot 71bca4c
Address CodeQL feedback: drop ReferenceEquals on struct, narrow gener…
Copilot 6533658
Sort new log event additions by EventId in Worker and Client Logs.cs
Copilot 665ba0e
Address Copilot review: clamp backoff cap, cover SilentDisconnectTime…
Copilot 4d1fbbe
Reconnect after graceful stream-end (GOAWAY) and thread-safe state swap
Copilot e0fc2e8
Fix critical bugs surfaced by code review
5dfc5b1
Address remaining audit nits
f71d510
Add M1: regression test for silent-disconnect classification
cec5f4a
Materialize new channel before swapping cache entry
ab763db
Address Copilot review feedback on cec5f4a
955ea63
Address Copilot review feedback on ab763db
5186ae1
Merge branch 'main' into fix/grpc-resilience-channel-recreate
berndverst 7df10dc
Address PR review follow-up on gRPC resilience
add1a39
Address PR feedback for gRPC recreation
4bc4a08
Stabilize WorkItemStreamConsumer timing test
aba04b7
Clamp worker hello deadline to UTC max
5594c7b
Simplify worker recreate flow
a754b55
Merge branch 'main' into fix/grpc-resilience-channel-recreate
berndverst c0dbb3d
Fix worker recreate ownership
c08076e
Simplify worker channel cache
fd9381d
Fix worker recreate disposal timing
dc243e1
Fix continue-as-new event carryover
18e23a3
Add worker disconnect coverage tests
dd9ad77
Fix fatal deferred-dispose filters
4d30b9b
Fix client recreate dispose race
caf307b
Ignore local git worktrees
c85234d
Move wrapper changes to separate PR
f246e90
Keep worktree ignore local
d291efd
Address latest PR feedback on reconnect cleanup
729654a
Merge branch 'main' into fix/grpc-resilience-channel-recreate
berndverst File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,270 @@ | ||
| // Copyright (c) Microsoft Corporation. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using System.Diagnostics; | ||
| using Microsoft.Extensions.Logging; | ||
|
|
||
| namespace Microsoft.DurableTask.Client.Grpc; | ||
|
|
||
/// <summary>
/// A <see cref="CallInvoker"/> wrapper that observes RPC outcomes and triggers a fire-and-forget channel
/// recreation after a configurable number of consecutive transport failures
/// (<see cref="StatusCode.Unavailable"/>, or <see cref="StatusCode.DeadlineExceeded"/> on RPCs that are
/// not long-poll waits). This guards against half-open HTTP/2 connections that can otherwise wedge
/// an entire client process for the lifetime of the gRPC channel.
/// </summary>
/// <remarks>
/// <para>The wrapper holds an immutable <see cref="TransportState"/> (channel + invoker pair) and swaps
/// the entire pair atomically on recreate to avoid torn state. Streaming RPCs are forwarded without
/// outcome observation; only unary RPC outcomes count toward the failure threshold.</para>
/// <para>The triggering RPC still surfaces its original failure to the caller; subsequent RPCs benefit
/// from the recreated channel.</para>
/// <para>When the wrapper owns its channel, <see cref="DisposeAsync"/> shuts down and disposes the channel
/// that is current at that moment. A recreate that completes after disposal cleans up the channel it
/// produced instead of leaking it.</para>
/// </remarks>
sealed class ChannelRecreatingCallInvoker : CallInvoker, IAsyncDisposable
{
    /// <summary>
    /// Methods on which a <see cref="StatusCode.DeadlineExceeded"/> response is expected behavior
    /// (long-poll-style waits) and must NOT be counted toward the recreate threshold.
    /// </summary>
    static readonly HashSet<string> DeadlineExceededAllowedMethods = new(StringComparer.Ordinal)
    {
        "/TaskHubSidecarService/WaitForInstanceCompletion",
        "/TaskHubSidecarService/WaitForInstanceStart",
    };

    /// <summary>
    /// Upper bound on how long a single recreate attempt may run before its token is cancelled.
    /// </summary>
    static readonly TimeSpan RecreateTimeout = TimeSpan.FromSeconds(30);

    /// <summary>
    /// Sentinel stored in <see cref="lastRecreateTicks"/> until the first recreate attempt finishes.
    /// Stopwatch timestamps are never this value, so the cooldown cannot block the first attempt.
    /// </summary>
    const long NeverAttempted = long.MinValue;

    readonly Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator;
    readonly int failureThreshold;
    readonly long cooldownTicks;
    readonly bool ownsChannel;
    readonly ILogger logger;

    // All mutable fields below are shared between RPC threads, continuation threads, and the background
    // recreate task; they are only touched through Volatile/Interlocked.
    TransportState state;
    int consecutiveFailures;
    int recreateInFlight;
    long lastRecreateTicks = NeverAttempted;
    int disposed;

    /// <summary>
    /// Initializes a new instance of the <see cref="ChannelRecreatingCallInvoker"/> class.
    /// </summary>
    /// <param name="initialChannel">The channel used until the first successful recreate.</param>
    /// <param name="recreator">
    /// Callback that receives the channel currently in use and a cancellation token, and returns the channel
    /// to use from now on. Returning the same instance leaves the transport state untouched.
    /// </param>
    /// <param name="failureThreshold">
    /// Number of consecutive counted failures that triggers a recreate. Zero or negative disables recreation.
    /// </param>
    /// <param name="minRecreateInterval">
    /// Minimum time between the end of one recreate attempt (successful or not) and the start of the next.
    /// </param>
    /// <param name="ownsChannel">Whether <see cref="DisposeAsync"/> shuts down and disposes the current channel.</param>
    /// <param name="logger">Logger for recreate lifecycle events.</param>
    public ChannelRecreatingCallInvoker(
        GrpcChannel initialChannel,
        Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator,
        int failureThreshold,
        TimeSpan minRecreateInterval,
        bool ownsChannel,
        ILogger logger)
    {
        this.recreator = recreator;
        this.failureThreshold = failureThreshold;
        this.cooldownTicks = StopwatchTicksFor(minRecreateInterval);
        this.ownsChannel = ownsChannel;
        this.logger = logger;
        this.state = new TransportState(initialChannel, initialChannel.CreateCallInvoker());
    }

    /// <summary>
    /// Gets the invoker of the transport state that is current at the time of the call.
    /// </summary>
    CallInvoker CurrentInvoker => Volatile.Read(ref this.state).Invoker;

    /// <inheritdoc/>
    public override TResponse BlockingUnaryCall<TRequest, TResponse>(
        Method<TRequest, TResponse> method, string? host, CallOptions options, TRequest request)
    {
        CallInvoker invoker = this.CurrentInvoker;
        try
        {
            TResponse response = invoker.BlockingUnaryCall(method, host, options, request);
            this.RecordSuccess();
            return response;
        }
        catch (RpcException ex)
        {
            // Count the failure, then let the caller see the original exception unchanged.
            this.RecordFailure(ex.StatusCode, method.FullName);
            throw;
        }
    }

    /// <inheritdoc/>
    public override AsyncUnaryCall<TResponse> AsyncUnaryCall<TRequest, TResponse>(
        Method<TRequest, TResponse> method, string? host, CallOptions options, TRequest request)
    {
        AsyncUnaryCall<TResponse> call = this.CurrentInvoker.AsyncUnaryCall(method, host, options, request);
        this.ObserveOutcome(call.ResponseAsync, method.FullName);
        return call;
    }

    /// <inheritdoc/>
    public override AsyncServerStreamingCall<TResponse> AsyncServerStreamingCall<TRequest, TResponse>(
        Method<TRequest, TResponse> method, string? host, CallOptions options, TRequest request)
    {
        // Streaming calls are forwarded without outcome observation. The streaming methods used by the
        // DurableTask client are bounded snapshots (e.g. StreamInstanceHistory) where errors surface as
        // exceptions to user code, so global failure counting on these would create false positives.
        return this.CurrentInvoker.AsyncServerStreamingCall(method, host, options, request);
    }

    /// <inheritdoc/>
    public override AsyncClientStreamingCall<TRequest, TResponse> AsyncClientStreamingCall<TRequest, TResponse>(
        Method<TRequest, TResponse> method, string? host, CallOptions options)
    {
        return this.CurrentInvoker.AsyncClientStreamingCall(method, host, options);
    }

    /// <inheritdoc/>
    public override AsyncDuplexStreamingCall<TRequest, TResponse> AsyncDuplexStreamingCall<TRequest, TResponse>(
        Method<TRequest, TResponse> method, string? host, CallOptions options)
    {
        return this.CurrentInvoker.AsyncDuplexStreamingCall(method, host, options);
    }

    /// <summary>
    /// Marks the wrapper as disposed and, when it owns the channel, shuts down and disposes the channel
    /// that is current at that moment. Calls after the first are no-ops.
    /// </summary>
    /// <returns>A task that completes when best-effort shutdown has finished.</returns>
    public async ValueTask DisposeAsync()
    {
        // Interlocked.Exchange is a full fence: either we observe a channel published by a concurrent
        // recreate below, or that recreate observes the flag and cleans up the channel it produced.
        if (Interlocked.Exchange(ref this.disposed, 1) != 0)
        {
            return;
        }

        if (!this.ownsChannel)
        {
            return;
        }

        await ShutdownAndDisposeAsync(Volatile.Read(ref this.state).Channel).ConfigureAwait(false);
    }

    /// <summary>
    /// Best-effort shutdown of <paramref name="channel"/> followed by an unconditional dispose.
    /// </summary>
    static async Task ShutdownAndDisposeAsync(GrpcChannel channel)
    {
        try
        {
            await channel.ShutdownAsync().ConfigureAwait(false);
        }
        catch (Exception ex) when (ex is OperationCanceledException or ObjectDisposedException)
        {
            // Best-effort disposal.
        }
        finally
        {
            // Dispose on every target framework, and even when shutdown failed, so the underlying
            // handler and connections are released.
            channel.Dispose();
        }
    }

    /// <summary>
    /// Converts <paramref name="ts"/> to <see cref="Stopwatch"/> ticks, saturating instead of overflowing.
    /// Non-positive intervals map to zero (no cooldown).
    /// </summary>
    static long StopwatchTicksFor(TimeSpan ts)
    {
        double ticks = ts.TotalSeconds * Stopwatch.Frequency;
        if (ticks <= 0)
        {
            return 0;
        }

        // (double)long.MaxValue rounds up to 2^63, which does not fit in a long, so anything at or
        // above it must saturate rather than be cast.
        return ticks >= long.MaxValue ? long.MaxValue : (long)ticks;
    }

    /// <summary>
    /// Attaches a continuation to <paramref name="responseAsync"/> that records the RPC outcome.
    /// Completions that are neither successful nor faulted with an <see cref="RpcException"/> are ignored.
    /// </summary>
    void ObserveOutcome<TResponse>(Task<TResponse> responseAsync, string methodFullName)
    {
        // Use ContinueWith with TaskScheduler.Default so we don't capture sync context. The state object
        // carries (this, method name) so the lambda itself captures nothing.
        responseAsync.ContinueWith(
            (t, boxed) =>
            {
                var (self, name) = ((ChannelRecreatingCallInvoker, string))boxed!;
                if (t.Status == TaskStatus.RanToCompletion)
                {
                    self.RecordSuccess();
                }
                else if (t.Exception?.InnerException is RpcException rpcEx)
                {
                    self.RecordFailure(rpcEx.StatusCode, name);
                }
            },
            (this, methodFullName),
            CancellationToken.None,
            TaskContinuationOptions.ExecuteSynchronously,
            TaskScheduler.Default);
    }

    /// <summary>
    /// Resets the consecutive-failure counter after a successful unary RPC.
    /// </summary>
    void RecordSuccess()
    {
        Volatile.Write(ref this.consecutiveFailures, 0);
    }

    /// <summary>
    /// Counts a failed unary RPC toward the recreate threshold when its status indicates a transport
    /// problem, and triggers a recreate once the threshold is reached.
    /// </summary>
    void RecordFailure(StatusCode status, string methodFullName)
    {
        if (this.failureThreshold <= 0)
        {
            // Recreation is disabled; there is nothing to count.
            return;
        }

        bool counts = status switch
        {
            StatusCode.Unavailable => true,
            StatusCode.DeadlineExceeded => !DeadlineExceededAllowedMethods.Contains(methodFullName),
            _ => false,
        };

        if (!counts)
        {
            return;
        }

        int count = Interlocked.Increment(ref this.consecutiveFailures);
        if (count >= this.failureThreshold)
        {
            this.MaybeTriggerRecreate(count);
        }
    }

    /// <summary>
    /// Starts a background recreate unless the wrapper is disposed, the cooldown has not elapsed, or
    /// another recreate is already in flight.
    /// </summary>
    void MaybeTriggerRecreate(int observedCount)
    {
        if (Volatile.Read(ref this.disposed) != 0 || this.IsCoolingDown())
        {
            return;
        }

        // Single-flight guard: only one recreate task in flight at a time.
        if (Interlocked.CompareExchange(ref this.recreateInFlight, 1, 0) != 0)
        {
            return;
        }

        // Re-check under the guard: a recreate may have finished between the first check and the CAS,
        // in which case starting another one immediately would violate the cooldown.
        if (this.IsCoolingDown())
        {
            Interlocked.Exchange(ref this.recreateInFlight, 0);
            return;
        }

        this.logger.RecreatingChannel(observedCount);
        _ = Task.Run(() => this.RecreateAsync());
    }

    /// <summary>
    /// Returns whether the previous recreate attempt finished less than the cooldown interval ago.
    /// Always <c>false</c> before the first attempt.
    /// </summary>
    bool IsCoolingDown()
    {
        long last = Volatile.Read(ref this.lastRecreateTicks);

        // Stopwatch timestamps only move forward, so the subtraction cannot overflow once a real
        // timestamp has been stored.
        return last != NeverAttempted && Stopwatch.GetTimestamp() - last < this.cooldownTicks;
    }

    /// <summary>
    /// Runs the recreator callback and, if it returned a different channel, publishes a new transport state.
    /// Always stamps <see cref="lastRecreateTicks"/> and releases the single-flight guard.
    /// </summary>
    async Task RecreateAsync()
    {
        try
        {
            TransportState current = Volatile.Read(ref this.state);
            using CancellationTokenSource cts = new(RecreateTimeout);
            GrpcChannel newChannel = await this.recreator(current.Channel, cts.Token).ConfigureAwait(false);

            if (!ReferenceEquals(newChannel, current.Channel))
            {
                // Full-fence publish; pairs with the Interlocked.Exchange in DisposeAsync.
                Interlocked.Exchange(
                    ref this.state, new TransportState(newChannel, newChannel.CreateCallInvoker()));
                this.logger.ChannelRecreated(GetEndpointDescription(newChannel));

                if (this.ownsChannel && Volatile.Read(ref this.disposed) != 0)
                {
                    // DisposeAsync may have run before the publish above and only seen the old channel;
                    // clean up the new one here so it is not leaked. A double shutdown is harmless.
                    await ShutdownAndDisposeAsync(newChannel).ConfigureAwait(false);
                }
            }

            // Successful recreate (even if a peer beat us to it) → reset the failure counter.
            Volatile.Write(ref this.consecutiveFailures, 0);
            Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp());
        }
        catch (Exception ex) when (ex is not OutOfMemoryException
            and not StackOverflowException
            and not ThreadAbortException)
        {
            this.logger.ChannelRecreateFailed(ex);

            // Update lastRecreateTicks even on failure so the cooldown applies to failed attempts too.
            Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp());
        }
        finally
        {
            Interlocked.Exchange(ref this.recreateInFlight, 0);
        }
    }

    /// <summary>
    /// Returns the channel target for logging, or "(unknown)" when it is not available.
    /// </summary>
    static string GetEndpointDescription(GrpcChannel channel) => channel.Target ?? "(unknown)";

    /// <summary>
    /// Immutable pairing of a channel with the invoker created from it, so both are always swapped together.
    /// </summary>
    sealed class TransportState
    {
        public TransportState(GrpcChannel channel, CallInvoker invoker)
        {
            this.Channel = channel;
            this.Invoker = invoker;
        }

        public GrpcChannel Channel { get; }

        public CallInvoker Invoker { get; }
    }
}
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.