-
Notifications
You must be signed in to change notification settings - Fork 5.5k
grpc stream: reduce log level depending on remote close status #17300
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
643e9ff
f1ffe40
0741a6b
bc53b0a
91c34bd
ddb10b2
ca83f79
a842b98
ba64bc1
1d88d72
970e5e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,14 @@ | |||||
| namespace Envoy { | ||||||
| namespace Config { | ||||||
|
|
||||||
| namespace { | ||||||
|
|
||||||
| // TODO(htuch): Make this configurable. | ||||||
| constexpr uint32_t RetryInitialDelayMs = 500; | ||||||
| constexpr uint32_t RetryMaxDelayMs = 30000; // Do not cross more than 30s | ||||||
|
|
||||||
| } // namespace | ||||||
|
|
||||||
| template <class ResponseProto> using ResponseProtoPtr = std::unique_ptr<ResponseProto>; | ||||||
|
|
||||||
| // Oversees communication for gRPC xDS implementations (parent to both regular xDS and delta | ||||||
|
|
@@ -45,9 +53,6 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| }); | ||||||
| } | ||||||
|
|
||||||
| // TODO(htuch): Make this configurable. | ||||||
| static constexpr uint32_t RetryInitialDelayMs = 500; | ||||||
| static constexpr uint32_t RetryMaxDelayMs = 30000; // Do not cross more than 30s | ||||||
| backoff_strategy_ = std::make_unique<JitteredExponentialBackOffStrategy>( | ||||||
| RetryInitialDelayMs, RetryMaxDelayMs, random_); | ||||||
| } | ||||||
|
|
@@ -60,12 +65,13 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| } | ||||||
| stream_ = async_client_->start(service_method_, *this, Http::AsyncClient::StreamOptions()); | ||||||
| if (stream_ == nullptr) { | ||||||
| ENVOY_LOG(warn, "Unable to establish new stream"); | ||||||
| ENVOY_LOG(debug, "Unable to establish new stream"); | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn't this one pretty important though if the config server is completely busted? (No healthy hosts, etc.) IMO we should make this error message easier to understand:
Suggested change
But also potentially rate limit the output? WDYT?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. #14616 (comment) asserted that this message is always accompanied by the remote close message. Tracing code (see e.g. code around here), this does seem to be the case at least for
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you are sure it always prints together I think it's fine to downgrade, but I would confirm with manual testing. The issue here is I don't think every case that this would get printed in would result in a remote close, for example no healthy host. |
||||||
| callbacks_->onEstablishmentFailure(); | ||||||
| setRetryTimer(); | ||||||
| return; | ||||||
| } | ||||||
| control_plane_stats_.connected_state_.set(1); | ||||||
| clearCloseStatus(); | ||||||
| callbacks_->onStreamEstablished(); | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -97,8 +103,7 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| } | ||||||
|
|
||||||
| void onRemoteClose(Grpc::Status::GrpcStatus status, const std::string& message) override { | ||||||
| ENVOY_LOG(warn, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, | ||||||
| message); | ||||||
| logClose(status, message); | ||||||
| stream_ = nullptr; | ||||||
| control_plane_stats_.connected_state_.set(0); | ||||||
| callbacks_->onEstablishmentFailure(); | ||||||
|
|
@@ -131,11 +136,78 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| return false; | ||||||
| } | ||||||
|
|
||||||
| absl::optional<Grpc::Status::GrpcStatus> getCloseStatus() { return close_status_; } | ||||||
|
|
||||||
| private: | ||||||
| void setRetryTimer() { | ||||||
| retry_timer_->enableTimer(std::chrono::milliseconds(backoff_strategy_->nextBackOffMs())); | ||||||
| } | ||||||
|
|
||||||
| // https://github.com/envoyproxy/envoy/issues/14591 | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| // Log level should be reduced when the remote close failure is `Ok` or is retriable and has only | ||||||
| // been occurring for a short amount of time. | ||||||
| void logClose(Grpc::Status::GrpcStatus status, const std::string& message) { | ||||||
| if (Grpc::Status::WellKnownGrpcStatus::Ok == status) { | ||||||
| ENVOY_LOG(debug, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, | ||||||
| message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| if (!onlyWarnOnRepeatedFailure(status)) { | ||||||
| // When the failure is considered non-retriable, warn. | ||||||
| ENVOY_LOG(warn, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, | ||||||
| message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| if (!isCloseStatusSet()) { | ||||||
| // For the first failure, record its occurrence and log at the debug level. | ||||||
| ENVOY_LOG(debug, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, | ||||||
| message); | ||||||
| setCloseStatus(status, message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| uint64_t ms_since_first_close = std::chrono::duration_cast<std::chrono::milliseconds>( | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| time_source_.monotonicTime() - close_time_) | ||||||
| .count(); | ||||||
| Grpc::Status::GrpcStatus close_status = close_status_.value(); | ||||||
|
|
||||||
| if (status != close_status) { | ||||||
| // This is a different failure. Warn on both statuses and remember the new one. | ||||||
| ENVOY_LOG(warn, "{} gRPC config stream closed: {}, {} (previously {}, {} since {}ms ago)", | ||||||
| service_method_.name(), status, message, close_status, close_message_, | ||||||
| ms_since_first_close); | ||||||
| setCloseStatus(status, message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| if (ms_since_first_close > RetryMaxDelayMs) { | ||||||
| // Warn if we are over the time limit. | ||||||
| ENVOY_LOG(warn, "{} gRPC config stream closed since {}ms ago: {}, {}", service_method_.name(), | ||||||
| ms_since_first_close, close_status, close_message_); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| // Failure is retriable and new enough to only log at the debug level. | ||||||
| ENVOY_LOG(debug, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, | ||||||
| message); | ||||||
| } | ||||||
|
|
||||||
| bool onlyWarnOnRepeatedFailure(Grpc::Status::GrpcStatus status) { | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| return Grpc::Status::WellKnownGrpcStatus::DeadlineExceeded == status || | ||||||
| Grpc::Status::WellKnownGrpcStatus::Unavailable == status; | ||||||
|
Comment on lines
+198
to
+208
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add some comments on how you decided these? Looking at https://grpc.github.io/grpc/core/md_doc_statuscodes.html I would naively assume
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, will add a comment. I could see why
|
||||||
| } | ||||||
|
|
||||||
| void clearCloseStatus() { close_status_ = absl::nullopt; } | ||||||
| bool isCloseStatusSet() { return close_status_.has_value(); } | ||||||
|
|
||||||
| void setCloseStatus(Grpc::Status::GrpcStatus status, const std::string& message) { | ||||||
| close_status_ = status; | ||||||
| close_time_ = time_source_.monotonicTime(); | ||||||
| close_message_ = message; | ||||||
| } | ||||||
|
|
||||||
| GrpcStreamCallbacks<ResponseProto>* const callbacks_; | ||||||
|
|
||||||
| Grpc::AsyncClient<RequestProto, ResponseProto> async_client_; | ||||||
|
|
@@ -153,6 +225,12 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| TokenBucketPtr limit_request_; | ||||||
| const bool rate_limiting_enabled_; | ||||||
| Event::TimerPtr drain_request_timer_; | ||||||
|
|
||||||
| // Records the status, message and timestamp of the first remote close in the most recent run of closes with the same | ||||||
|
tbarrella marked this conversation as resolved.
|
||||||
| // status. | ||||||
| absl::optional<Grpc::Status::GrpcStatus> close_status_ = absl::nullopt; | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| std::string close_message_; | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| MonotonicTime close_time_; | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| }; | ||||||
|
|
||||||
| } // namespace Config | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.