-
Notifications
You must be signed in to change notification settings - Fork 5.5k
grpc stream: reduce log level depending on remote close status #17300
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
643e9ff
f1ffe40
0741a6b
bc53b0a
91c34bd
ddb10b2
ca83f79
a842b98
ba64bc1
1d88d72
970e5e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -15,6 +15,19 @@ | |||||
| namespace Envoy { | ||||||
| namespace Config { | ||||||
|
|
||||||
| namespace { | ||||||
|
|
||||||
| constexpr auto CloseLogMessage = "{} gRPC config stream closed: {}, {}"; | ||||||
| constexpr auto CloseLogMessageWithSince = "{} gRPC config stream closed since {}ms ago: {}, {}"; | ||||||
| constexpr auto CloseLogMessageWithPrevious = | ||||||
| "{} gRPC config stream closed: {}, {} (previously {}, {} since {}ms ago)"; | ||||||
|
|
||||||
| // TODO(htuch): Make this configurable. | ||||||
| constexpr uint32_t RetryInitialDelayMs = 500; | ||||||
| constexpr uint32_t RetryMaxDelayMs = 30000; // Do not cross more than 30s | ||||||
|
|
||||||
| } // namespace | ||||||
|
|
||||||
| template <class ResponseProto> using ResponseProtoPtr = std::unique_ptr<ResponseProto>; | ||||||
|
|
||||||
| // Oversees communication for gRPC xDS implementations (parent to both regular xDS and delta | ||||||
|
|
@@ -45,9 +58,6 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| }); | ||||||
| } | ||||||
|
|
||||||
| // TODO(htuch): Make this configurable. | ||||||
| static constexpr uint32_t RetryInitialDelayMs = 500; | ||||||
| static constexpr uint32_t RetryMaxDelayMs = 30000; // Do not cross more than 30s | ||||||
| backoff_strategy_ = std::make_unique<JitteredExponentialBackOffStrategy>( | ||||||
| RetryInitialDelayMs, RetryMaxDelayMs, random_); | ||||||
| } | ||||||
|
|
@@ -60,12 +70,13 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| } | ||||||
| stream_ = async_client_->start(service_method_, *this, Http::AsyncClient::StreamOptions()); | ||||||
| if (stream_ == nullptr) { | ||||||
| ENVOY_LOG(warn, "Unable to establish new stream"); | ||||||
| ENVOY_LOG(debug, "Unable to establish new stream"); | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't this one pretty important though if the config server is completely busted? (No healthy hosts, etc.) IMO we should make this error message easier to understand:
Suggested change
But also potentially rate limit the output? WDYT?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. #14616 (comment) asserted that this message is always accompanied by the remote close message. Tracing code (see e.g. code around here), this does seem to be the case at least for
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you are sure the two messages are always printed together, I think it's fine to downgrade, but I would confirm with manual testing. My concern is that not every case in which this message is printed results in a remote close, for example when there is no healthy host. |
||||||
| callbacks_->onEstablishmentFailure(); | ||||||
| setRetryTimer(); | ||||||
| return; | ||||||
| } | ||||||
| control_plane_stats_.connected_state_.set(1); | ||||||
| unsetCloseStatus(); | ||||||
| callbacks_->onStreamEstablished(); | ||||||
| } | ||||||
|
|
||||||
|
|
@@ -97,8 +108,7 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| } | ||||||
|
|
||||||
| void onRemoteClose(Grpc::Status::GrpcStatus status, const std::string& message) override { | ||||||
| ENVOY_LOG(warn, "{} gRPC config stream closed: {}, {}", service_method_.name(), status, | ||||||
| message); | ||||||
| logClose(status, message); | ||||||
| stream_ = nullptr; | ||||||
| control_plane_stats_.connected_state_.set(0); | ||||||
| callbacks_->onEstablishmentFailure(); | ||||||
|
|
@@ -131,11 +141,74 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| return false; | ||||||
| } | ||||||
|
|
||||||
| absl::optional<Grpc::Status::GrpcStatus> getCloseStatus() { return close_status_; } | ||||||
|
|
||||||
| private: | ||||||
| void setRetryTimer() { | ||||||
| retry_timer_->enableTimer(std::chrono::milliseconds(backoff_strategy_->nextBackOffMs())); | ||||||
| } | ||||||
|
|
||||||
| // https://github.com/envoyproxy/envoy/issues/14591 | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| // Log level should be reduced when the remote close failure is `Ok` or is retriable and has only | ||||||
| // been occurring for a short amount of time. | ||||||
| void logClose(Grpc::Status::GrpcStatus status, const std::string& message) { | ||||||
| if (Grpc::Status::WellKnownGrpcStatus::Ok == status) { | ||||||
| ENVOY_LOG(debug, CloseLogMessage, service_method_.name(), status, message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| if (!onlyWarnOnRepeatedFailure(status)) { | ||||||
| // Failure is considered non-retriable. Warn. | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| ENVOY_LOG(warn, CloseLogMessage, service_method_.name(), status, message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| if (!isCloseStatusSet()) { | ||||||
| // First failure. Debug. Record occurrence. | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| ENVOY_LOG(debug, CloseLogMessage, service_method_.name(), status, message); | ||||||
| setCloseStatus(status, message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| uint64_t ms_since_first_close = std::chrono::duration_cast<std::chrono::milliseconds>( | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| time_source_.monotonicTime() - close_time_) | ||||||
| .count(); | ||||||
| Grpc::Status::GrpcStatus close_status = close_status_.value(); | ||||||
|
|
||||||
| if (status != close_status) { | ||||||
| // This is a different failure. Warn on both statuses and remember the new one. | ||||||
| ENVOY_LOG(warn, CloseLogMessageWithPrevious, service_method_.name(), status, message, | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| close_status, close_message_, ms_since_first_close); | ||||||
| setCloseStatus(status, message); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| if (ms_since_first_close > RetryMaxDelayMs) { | ||||||
| // Warn if we are over the time limit. | ||||||
| ENVOY_LOG(warn, CloseLogMessageWithSince, service_method_.name(), ms_since_first_close, | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| close_status, close_message_); | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| // Failure is retriable and new enough to only log at the debug level. | ||||||
| ENVOY_LOG(debug, CloseLogMessage, service_method_.name(), status, message); | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| } | ||||||
|
|
||||||
| bool onlyWarnOnRepeatedFailure(Grpc::Status::GrpcStatus status) { | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| return Grpc::Status::WellKnownGrpcStatus::Unavailable == status || | ||||||
| Grpc::Status::WellKnownGrpcStatus::DeadlineExceeded == status || | ||||||
| Grpc::Status::WellKnownGrpcStatus::Internal == status; | ||||||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is included due to the example in #14591 (error 13), but gRPC docs classify this as a serious error:
I'm questioning whether the server should return this in the first place if it doesn't seem to be serious. @howardjohn @kyessenov @mandarjog thoughts?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am pretty sure FWIW if you want to test you can set
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you. That makes sense to me since it's happening every 30 minutes in that example. It looks like in recent Istio (1.10.2) the recurring error is now 14 (Unavailable)
After building Envoy with this change but without this line (i.e. without treating Internal as retriable), I no longer got warnings for the above error. So I think we should not special-case Internal for now, and only treat DeadlineExceeded/Unavailable as retriable. |
||||||
| } | ||||||
|
|
||||||
| void unsetCloseStatus() { close_status_ = absl::nullopt; } | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| bool isCloseStatusSet() { return close_status_.has_value(); } | ||||||
|
|
||||||
| void setCloseStatus(Grpc::Status::GrpcStatus status, const std::string& message) { | ||||||
| close_status_ = status; | ||||||
| close_time_ = time_source_.monotonicTime(); | ||||||
| close_message_ = message; | ||||||
| } | ||||||
|
|
||||||
| GrpcStreamCallbacks<ResponseProto>* const callbacks_; | ||||||
|
|
||||||
| Grpc::AsyncClient<RequestProto, ResponseProto> async_client_; | ||||||
|
|
@@ -153,6 +226,12 @@ class GrpcStream : public Grpc::AsyncStreamCallbacks<ResponseProto>, | |||||
| TokenBucketPtr limit_request_; | ||||||
| const bool rate_limiting_enabled_; | ||||||
| Event::TimerPtr drain_request_timer_; | ||||||
|
|
||||||
| // Records the initial message and timestamp of the most recent remote closes with the same | ||||||
|
tbarrella marked this conversation as resolved.
|
||||||
| // status. | ||||||
| absl::optional<Grpc::Status::GrpcStatus> close_status_ = absl::nullopt; | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| std::string close_message_; | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| MonotonicTime close_time_; | ||||||
|
tbarrella marked this conversation as resolved.
Outdated
|
||||||
| }; | ||||||
|
|
||||||
| } // namespace Config | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.