peering: retry establishing connection more quickly on certain errors #13938
Merged

Changes from all commits (6 commits):
- 4d5373c peering: retry some disconnect errors more quickly (lkysow)
- 7a80cca Fix test (lkysow)
- 026e4d8 Move function down to bottom (lkysow)
- ac8f97b Add some tests (lkysow)
- c269684 Test retryLoopBackoffPeering (lkysow)
- 95d1e8b Exit loop on success (lkysow)
Diff:

```diff
@@ -6,6 +6,7 @@ import (
 	"crypto/tls"
 	"crypto/x509"
 	"fmt"
+	"math"
 	"time"
 
 	"github.com/armon/go-metrics"
@@ -16,8 +17,10 @@ import (
 	"github.com/hashicorp/go-uuid"
 	"golang.org/x/time/rate"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/credentials"
 	"google.golang.org/grpc/keepalive"
+	grpcstatus "google.golang.org/grpc/status"
 
 	"github.com/hashicorp/consul/acl"
 	"github.com/hashicorp/consul/agent/consul/state"
@@ -38,6 +41,15 @@ var LeaderPeeringMetrics = []prometheus.GaugeDefinition{
 		"We emit this metric every 9 seconds",
 	},
 }
+var (
+	// fastConnRetryTimeout is how long we wait between retrying connections following the "fast" path
+	// which is triggered on specific connection errors.
+	fastConnRetryTimeout = 8 * time.Millisecond
+	// maxFastConnRetries is the maximum number of fast connection retries before we follow exponential backoff.
+	maxFastConnRetries = uint(5)
+	// maxFastRetryBackoff is the maximum amount of time we'll wait between retries following the fast path.
+	maxFastRetryBackoff = 8192 * time.Millisecond
+)
 
 func (s *Server) startPeeringStreamSync(ctx context.Context) {
 	s.leaderRoutineManager.Start(ctx, peeringStreamsRoutineName, s.runPeeringSync)
```
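An aside (not part of the PR): given these constants, the fast path waits a constant 8ms for the first 5 failures, then doubles the wait on each subsequent failure until it caps at 8192ms. A standalone Go sketch that reproduces the schedule implemented by `peeringRetryTimeout` further down in this diff:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Mirrors of the constants added above.
	fastConnRetryTimeout := 8 * time.Millisecond
	maxFastConnRetries := uint(5)
	maxFastRetryBackoff := 8192 * time.Millisecond

	for attempt := uint(1); attempt <= 16; attempt++ {
		delay := fastConnRetryTimeout // constant wait for the first 5 failures
		if attempt > maxFastConnRetries {
			// Exponential growth: 8ms * 2^(attempt-5), capped at 8192ms.
			delay = fastConnRetryTimeout * (1 << (attempt - maxFastConnRetries))
			if delay > maxFastRetryBackoff {
				delay = maxFastRetryBackoff
			}
		}
		fmt.Printf("attempt %2d: wait %v\n", attempt, delay)
	}
	// Prints 8ms for attempts 1-5, then 16ms, 32ms, ..., reaching the
	// 8.192s cap at attempt 15.
}
```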
|
|
```diff
@@ -300,7 +312,7 @@ func (s *Server) establishStream(ctx context.Context, logger hclog.Logger, peer
 	}
 
 	// Establish a stream-specific retry so that retrying stream/conn errors isn't dependent on state store changes.
-	go retryLoopBackoff(retryCtx, func() error {
+	go retryLoopBackoffPeering(retryCtx, logger, func() error {
 		// Try a new address on each iteration by advancing the ring buffer on errors.
 		defer func() {
 			buffer = buffer.Next()
@@ -349,18 +361,21 @@ func (s *Server) establishStream(ctx context.Context, logger hclog.Logger, peer
 		if err == nil {
 			stream.CloseSend()
 			s.peerStreamServer.DrainStream(streamReq)
 
 			// This will cancel the retry-er context, letting us break out of this loop when we want to shut down the stream.
 			cancel()
 
 			logger.Info("closed outbound stream")
 		}
 		return err
 	}, func(err error) {
 		// TODO(peering): why are we using TrackSendError here? This could also be a receive error.
 		streamStatus.TrackSendError(err.Error())
-		logger.Error("error managing peering stream", "peer_id", peer.ID, "error", err)
-	})
+		if isFailedPreconditionErr(err) {
+			logger.Debug("stream disconnected due to 'failed precondition' error; reconnecting",
+				"error", err)
+			return
+		}
+		logger.Error("error managing peering stream", "error", err)
+	}, peeringRetryTimeout)
 
 	return nil
 }
```
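For context on the new errFn branch: a "failed precondition" error is an ordinary gRPC status error. A minimal standalone sketch (the error messages are hypothetical) showing how such an error is built with google.golang.org/grpc/status and matched by the isFailedPreconditionErr helper this PR adds:

```go
package main

import (
	"fmt"

	"google.golang.org/grpc/codes"
	grpcstatus "google.golang.org/grpc/status"
)

// isFailedPreconditionErr mirrors the helper added at the bottom of this diff.
func isFailedPreconditionErr(err error) bool {
	if err == nil {
		return false
	}
	grpcErr, ok := grpcstatus.FromError(err)
	if !ok {
		return false
	}
	return grpcErr.Code() == codes.FailedPrecondition
}

func main() {
	// e.g. what a follower behind a load balancer might return when the
	// request needs to land on the leader.
	err := grpcstatus.Error(codes.FailedPrecondition, "node is not a leader")
	fmt.Println(isFailedPreconditionErr(err)) // true -> fast retry path

	err = grpcstatus.Error(codes.Unavailable, "connection refused")
	fmt.Println(isFailedPreconditionErr(err)) // false -> default backoff
}
```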
|
|
```diff
@@ -517,3 +532,84 @@ func (s *Server) deleteTrustBundleFromPeer(ctx context.Context, limiter *rate.Limiter
 	_, err = s.raftApplyProtobuf(structs.PeeringTrustBundleDeleteType, req)
 	return err
 }
+
+// retryLoopBackoffPeering re-runs loopFn with a backoff on error. errFn is run whenever
+// loopFn returns an error. retryTimeFn is used to calculate the time between retries on error.
+// It is passed the number of errors in a row that loopFn has returned and the latest error
+// from loopFn.
+//
+// This function is modelled off of retryLoopBackoffHandleSuccess but is specific to peering
+// because peering needs to use different retry times depending on which error is returned.
+// This function doesn't use a rate limiter, unlike retryLoopBackoffHandleSuccess, because
+// the rate limiter is only needed in the success case when loopFn returns nil and we want to
+// loop again. In the peering case, we exit on a successful loop so we don't need the limiter.
+func retryLoopBackoffPeering(ctx context.Context, logger hclog.Logger, loopFn func() error, errFn func(error),
+	retryTimeFn func(failedAttempts uint, loopErr error) time.Duration) {
+	var failedAttempts uint
+	var err error
+	for {
+		if err = loopFn(); err != nil {
+			errFn(err)
+
+			if failedAttempts < math.MaxUint {
+				failedAttempts++
+			}
+
+			retryTime := retryTimeFn(failedAttempts, err)
+			logger.Trace("in connection retry backoff", "delay", retryTime)
+			timer := time.NewTimer(retryTime)
+
+			select {
+			case <-ctx.Done():
+				timer.Stop()
+				return
+			case <-timer.C:
+			}
+			continue
+		}
+		return
+	}
+}
+
+// peeringRetryTimeout returns the time that should be waited between re-establishing a peering
+// connection after an error. We follow the default backoff from retryLoopBackoff
+// unless the error is a "failed precondition" error in which case we retry much more quickly.
+// Retrying quickly is important in the case of a failed precondition error because we expect it to resolve
+// quickly. For example in the case of connecting with a follower through a load balancer, we just need to retry
+// until our request lands on a leader.
+func peeringRetryTimeout(failedAttempts uint, loopErr error) time.Duration {
+	if loopErr != nil && isFailedPreconditionErr(loopErr) {
+		// Wait a constant time for the first number of retries.
+		if failedAttempts <= maxFastConnRetries {
+			return fastConnRetryTimeout
+		}
+		// From here, follow an exponential backoff maxing out at maxFastRetryBackoff.
+		// The below equation multiplies fastConnRetryTimeout by 2^n where n is the number of failed attempts
+		// we're on, starting at 1 now that we're past our maxFastConnRetries.
+		// For example if fastConnRetryTimeout == 8ms and maxFastConnRetries == 5, then at 6 failed retries
+		// we'll do 8ms * 2^1 = 16ms, then 8ms * 2^2 = 32ms, etc.
+		ms := fastConnRetryTimeout * (1 << (failedAttempts - maxFastConnRetries))
+		if ms > maxFastRetryBackoff {
+			return maxFastRetryBackoff
+		}
+		return ms
+	}
+
+	// Else we go with the default backoff from retryLoopBackoff.
+	if (1 << failedAttempts) < maxRetryBackoff {
+		return (1 << failedAttempts) * time.Second
+	}
+	return time.Duration(maxRetryBackoff) * time.Second
+}
+
+// isFailedPreconditionErr returns true if err is a gRPC error with code FailedPrecondition.
+func isFailedPreconditionErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	grpcErr, ok := grpcstatus.FromError(err)
+	if !ok {
+		return false
+	}
+	return grpcErr.Code() == codes.FailedPrecondition
+}
```

Author review comment on the `var err error` line: "err doesn't need to be here."
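A sketch of how the new pieces compose. It assumes retryLoopBackoffPeering, peeringRetryTimeout, and isFailedPreconditionErr from the diff above are in scope in the same package (peeringRetryTimeout's default branch also depends on the package-level maxRetryBackoff constant, which is not part of this diff); the dial function is a hypothetical stand-in for the real stream setup:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/hashicorp/go-hclog"
	"google.golang.org/grpc/codes"
	grpcstatus "google.golang.org/grpc/status"
)

func main() {
	logger := hclog.Default()
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	attempts := 0
	retryLoopBackoffPeering(ctx, logger,
		func() error {
			attempts++
			// Hypothetical dial: hit a follower twice (FailedPrecondition),
			// then succeed, which exits the loop.
			if attempts < 3 {
				return grpcstatus.Error(codes.FailedPrecondition, "node is not a leader")
			}
			return nil
		},
		func(err error) {
			// Runs on every failure, before the backoff sleep.
			logger.Debug("retrying stream", "error", err)
		},
		peeringRetryTimeout, // both failures here wait only fastConnRetryTimeout (8ms)
	)
	fmt.Println("connected after", attempts, "attempts")
}
```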
Review comment: Log errors differently so as not to spam the logs with errors that aren't actual errors.