Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/root/intro/arch_overview/observability/tracing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ associated with it. Each span generated by Envoy contains the following data:
* Upstream cluster name, observability name, and address.
* HTTP response status code.
* GRPC response status and message (if available).
* An error tag when HTTP status is 5xx or GRPC status is not "OK".
* An error tag when HTTP status is 5xx or GRPC status is not "OK" and represents a server side error.
See `GRPC's documentation <https://grpc.github.io/grpc/core/md_doc_statuscodes.html>`_ for more information about GRPC status code.
* Tracing system-specific metadata.

The span also includes a name (or operation) which by default is defined as the host of the invoked
Expand Down
1 change: 1 addition & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Minor Behavior Changes
* router: record upstream request timeouts for all the cases and not just for those requests which are awaiting headers. This behavioral change can be temporarily reverted by setting runtime guard ``envoy.reloadable_features.do_not_await_headers_on_upstream_timeout_to_emit_stats`` to false.
* sip-proxy: add customized affinity support by adding :ref:`tra_service_config <envoy_v3_api_msg_extensions.filters.network.sip_proxy.tra.v3alpha.TraServiceConfig>` and :ref:`customized_affinity <envoy_v3_api_msg_extensions.filters.network.sip_proxy.v3alpha.CustomizedAffinity>`.
* sip-proxy: add support for the ``503`` response code. When there is something wrong occurred, send ``503 Service Unavailable`` back to downstream.
* tracing: set tracing error tag for grpc non-ok response code only when it is a upstream error. Client error will not be tagged as a grpc error. This fix is guarded by ``envoy.reloadable_features.update_grpc_response_error_tag``.

Bug Fixes
---------
Expand Down
2 changes: 2 additions & 0 deletions source/common/runtime/runtime_features.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ RUNTIME_GUARD(envoy_reloadable_features_support_locality_update_on_eds_cluster_e
RUNTIME_GUARD(envoy_reloadable_features_test_feature_true);
RUNTIME_GUARD(envoy_reloadable_features_udp_listener_updates_filter_chain_in_place);
RUNTIME_GUARD(envoy_reloadable_features_update_expected_rq_timeout_on_retry);
RUNTIME_GUARD(envoy_reloadable_features_update_grpc_response_error_tag);
RUNTIME_GUARD(envoy_reloadable_features_use_dns_ttl);
RUNTIME_GUARD(envoy_reloadable_features_validate_connect);
RUNTIME_GUARD(envoy_restart_features_explicit_wildcard_resource);
Expand Down Expand Up @@ -167,6 +168,7 @@ constexpr absl::Flag<bool>* runtime_features[] = {
&FLAGS_envoy_reloadable_features_udp_listener_updates_filter_chain_in_place,
&FLAGS_envoy_reloadable_features_unified_mux,
&FLAGS_envoy_reloadable_features_update_expected_rq_timeout_on_retry,
&FLAGS_envoy_reloadable_features_update_grpc_response_error_tag,
&FLAGS_envoy_reloadable_features_use_dns_ttl,
&FLAGS_envoy_reloadable_features_validate_connect,
&FLAGS_envoy_restart_features_explicit_wildcard_resource,
Expand Down
41 changes: 38 additions & 3 deletions source/common/tracing/http_tracer_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,45 @@ static void addGrpcRequestTags(Span& span, const Http::RequestHeaderMap& headers
template <class T> static void addGrpcResponseTags(Span& span, const T& headers) {
addTagIfNotNull(span, Tracing::Tags::get().GrpcStatusCode, headers.GrpcStatus());
addTagIfNotNull(span, Tracing::Tags::get().GrpcMessage, headers.GrpcMessage());
// Set error tag when status is not OK.
// Set error tag when Grpc status code represents an upstream error. See
// https://github.com/envoyproxy/envoy/issues/18877.
absl::optional<Grpc::Status::GrpcStatus> grpc_status_code = Grpc::Common::getGrpcStatus(headers);
if (grpc_status_code && grpc_status_code.value() != Grpc::Status::WellKnownGrpcStatus::Ok) {
span.setTag(Tracing::Tags::get().Error, Tracing::Tags::get().True);
if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) {
if (grpc_status_code.has_value()) {
const auto& status = grpc_status_code.value();
if (status != Grpc::Status::WellKnownGrpcStatus::InvalidCode) {
Comment thread
alyssawilk marked this conversation as resolved.
switch (status) {
// Each case below is considered to be a client side error, therefore should not be
// tagged as an upstream error. See https://grpc.github.io/grpc/core/md_doc_statuscodes.html
// for more details about how each Grpc status code is defined and whether it is an
// upstream error or a client error.
case Grpc::Status::WellKnownGrpcStatus::Ok:
Comment thread
alyssawilk marked this conversation as resolved.
case Grpc::Status::WellKnownGrpcStatus::Canceled:
case Grpc::Status::WellKnownGrpcStatus::InvalidArgument:
case Grpc::Status::WellKnownGrpcStatus::NotFound:
case Grpc::Status::WellKnownGrpcStatus::AlreadyExists:
case Grpc::Status::WellKnownGrpcStatus::PermissionDenied:
case Grpc::Status::WellKnownGrpcStatus::FailedPrecondition:
case Grpc::Status::WellKnownGrpcStatus::Aborted:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, canceled says "usually by the caller" but aborted says it's a concurrency issue. Would that be client side or server side error do you think?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aborted is used due to a concurrency issue which means the client should restart a read-modify-write sequence. I think this would be a client side error.

case Grpc::Status::WellKnownGrpcStatus::OutOfRange:
case Grpc::Status::WellKnownGrpcStatus::Unauthenticated:
break;
case Grpc::Status::WellKnownGrpcStatus::Unknown:
case Grpc::Status::WellKnownGrpcStatus::DeadlineExceeded:
case Grpc::Status::WellKnownGrpcStatus::Unimplemented:
case Grpc::Status::WellKnownGrpcStatus::ResourceExhausted:
case Grpc::Status::WellKnownGrpcStatus::Internal:
case Grpc::Status::WellKnownGrpcStatus::Unavailable:
case Grpc::Status::WellKnownGrpcStatus::DataLoss:
span.setTag(Tracing::Tags::get().Error, Tracing::Tags::get().True);
break;
}
}
}
} else {
if (grpc_status_code && grpc_status_code.value() != Grpc::Status::WellKnownGrpcStatus::Ok) {
span.setTag(Tracing::Tags::get().Error, Tracing::Tags::get().True);
}
}
}

Expand Down
36 changes: 36 additions & 0 deletions test/common/tracing/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,47 @@ licenses(["notice"]) # Apache 2

envoy_package()

envoy_cc_test(
name = "http_tracer_impl_legacy_test",
srcs = [
"http_tracer_impl_test.cc",
],
args = [
"--runtime-feature-disable-for-tests=envoy.reloadable_features.update_grpc_response_error_tag",
],
coverage = True,
deps = [
"//source/common/common:base64_lib",
"//source/common/http:header_map_lib",
"//source/common/http:headers_lib",
"//source/common/http:message_lib",
"//source/common/network:address_lib",
"//source/common/runtime:runtime_lib",
"//source/common/tracing:custom_tag_lib",
"//source/common/tracing:http_tracer_lib",
"//test/mocks/http:http_mocks",
"//test/mocks/local_info:local_info_mocks",
"//test/mocks/router:router_mocks",
"//test/mocks/runtime:runtime_mocks",
"//test/mocks/stats:stats_mocks",
"//test/mocks/thread_local:thread_local_mocks",
"//test/mocks/tracing:tracing_mocks",
"//test/test_common:environment_lib",
"//test/test_common:utility_lib",
"@envoy_api//envoy/config/core/v3:pkg_cc_proto",
"@envoy_api//envoy/type/tracing/v3:pkg_cc_proto",
],
)

envoy_cc_test(
name = "http_tracer_impl_test",
srcs = [
"http_tracer_impl_test.cc",
],
args = [
"--runtime-feature-override-for-tests=envoy.reloadable_features.update_grpc_response_error_tag",
],
coverage = True,
deps = [
"//source/common/common:base64_lib",
"//source/common/http:header_map_lib",
Expand Down
55 changes: 39 additions & 16 deletions test/common/tracing/http_tracer_impl_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -622,8 +622,15 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcErrorTag) {

Http::TestResponseHeaderMapImpl response_headers{{":status", "200"},
{"content-type", "application/grpc"}};
Http::TestResponseTrailerMapImpl response_trailers{{"grpc-status", "7"},
{"grpc-message", "permission denied"}};

Http::TestResponseTrailerMapImpl response_trailers;
if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) {
response_trailers.setGrpcStatus("14");
response_trailers.setGrpcMessage("unavailable");
} else {
response_trailers.setGrpcStatus("7");
response_trailers.setGrpcMessage("permission denied");
}

absl::optional<Http::Protocol> protocol = Http::Protocol::Http2;
absl::optional<uint32_t> response_code(200);
Expand All @@ -634,16 +641,21 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcErrorTag) {
stream_info.downstream_connection_info_provider_->setDirectRemoteAddressForTest(remote_address);

EXPECT_CALL(span, setTag(_, _)).Times(testing::AnyNumber());
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True)));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpMethod), Eq("POST")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpProtocol), Eq("HTTP/2")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpStatusCode), Eq("200")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcPath), Eq("/pb.Foo/Bar")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcAuthority), Eq("example.com:80")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcContentType), Eq("application/grpc")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcTimeout), Eq("10s")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied")));
if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) {
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("14")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("unavailable")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True)));
} else {
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied")));
}
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().PeerAddress), Eq(expected_ip)));

HttpTracerUtility::finalizeDownstreamSpan(span, &request_headers, &response_headers,
Expand All @@ -665,9 +677,16 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcTrailersOnly) {
{"te", "trailers"}};

Http::TestResponseHeaderMapImpl response_headers{{":status", "200"},
{"content-type", "application/grpc"},
{"grpc-status", "7"},
{"grpc-message", "permission denied"}};
{"content-type", "application/grpc"}};

if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) {
response_headers.setGrpcStatus("14");
response_headers.setGrpcMessage("unavailable");
} else {
response_headers.setGrpcStatus("7");
response_headers.setGrpcMessage("permission denied");
}

Http::TestResponseTrailerMapImpl response_trailers;

absl::optional<Http::Protocol> protocol = Http::Protocol::Http2;
Expand All @@ -679,15 +698,20 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcTrailersOnly) {
stream_info.downstream_connection_info_provider_->setDirectRemoteAddressForTest(remote_address);

EXPECT_CALL(span, setTag(_, _)).Times(testing::AnyNumber());
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True)));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpMethod), Eq("POST")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpProtocol), Eq("HTTP/2")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpStatusCode), Eq("200")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcPath), Eq("/pb.Foo/Bar")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcAuthority), Eq("example.com:80")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcContentType), Eq("application/grpc")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied")));
if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) {
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("14")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("unavailable")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True)));
} else {
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7")));
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied")));
}
EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().PeerAddress), Eq(expected_ip)));

HttpTracerUtility::finalizeDownstreamSpan(span, &request_headers, &response_headers,
Expand Down Expand Up @@ -740,13 +764,12 @@ class HttpTracerImplTest : public testing::Test {
Upstream::HostDescriptionConstSharedPtr shared_host(host_);
stream_info_.upstreamInfo()->setUpstreamHost(shared_host);
}

Http::TestRequestHeaderMapImpl request_headers_{
{":path", "/"}, {":method", "GET"}, {"x-request-id", "foo"}, {":authority", "test"}};
Http::TestResponseHeaderMapImpl response_headers_{{":status", "200"},
{"content-type", "application/grpc"},
{"grpc-status", "7"},
{"grpc-message", "permission denied"}};
{"grpc-status", "14"},
{"grpc-message", "unavailable"}};
Http::TestResponseTrailerMapImpl response_trailers_;
NiceMock<StreamInfo::MockStreamInfo> stream_info_;
NiceMock<LocalInfo::MockLocalInfo> local_info_;
Expand Down Expand Up @@ -823,8 +846,8 @@ TEST_F(HttpTracerImplTest, ChildUpstreamSpanTest) {
EXPECT_CALL(*second_span,
setTag(Eq(Tracing::Tags::get().UpstreamClusterName), Eq("ob fake cluster")));
EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().HttpStatusCode), Eq("200")));
EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7")));
EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied")));
EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("14")));
EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("unavailable")));
EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True)));

HttpTracerUtility::finalizeUpstreamSpan(*child_span, &response_headers_, &response_trailers_,
Expand Down