diff --git a/docs/root/intro/arch_overview/observability/tracing.rst b/docs/root/intro/arch_overview/observability/tracing.rst index ece8d4e8b807c..db4c9c92ed6d3 100644 --- a/docs/root/intro/arch_overview/observability/tracing.rst +++ b/docs/root/intro/arch_overview/observability/tracing.rst @@ -134,7 +134,8 @@ associated with it. Each span generated by Envoy contains the following data: * Upstream cluster name, observability name, and address. * HTTP response status code. * GRPC response status and message (if available). -* An error tag when HTTP status is 5xx or GRPC status is not "OK". +* An error tag when HTTP status is 5xx or GRPC status is not "OK" and represents a server side error. + See `GRPC's documentation `_ for more information about GRPC status code. * Tracing system-specific metadata. The span also includes a name (or operation) which by default is defined as the host of the invoked diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst index b53bfa4846fac..6061fbc67fcc6 100644 --- a/docs/root/version_history/current.rst +++ b/docs/root/version_history/current.rst @@ -24,6 +24,7 @@ Minor Behavior Changes * router: record upstream request timeouts for all the cases and not just for those requests which are awaiting headers. This behavioral change can be temporarily reverted by setting runtime guard ``envoy.reloadable_features.do_not_await_headers_on_upstream_timeout_to_emit_stats`` to false. * sip-proxy: add customized affinity support by adding :ref:`tra_service_config ` and :ref:`customized_affinity `. * sip-proxy: add support for the ``503`` response code. When there is something wrong occurred, send ``503 Service Unavailable`` back to downstream. +* tracing: set tracing error tag for grpc non-ok response code only when it is a upstream error. Client error will not be tagged as a grpc error. This fix is guarded by ``envoy.reloadable_features.update_grpc_response_error_tag``. Bug Fixes --------- diff --git a/source/common/runtime/runtime_features.cc b/source/common/runtime/runtime_features.cc index ab63a8c5a79a3..47081ea97532b 100644 --- a/source/common/runtime/runtime_features.cc +++ b/source/common/runtime/runtime_features.cc @@ -51,6 +51,7 @@ RUNTIME_GUARD(envoy_reloadable_features_support_locality_update_on_eds_cluster_e RUNTIME_GUARD(envoy_reloadable_features_test_feature_true); RUNTIME_GUARD(envoy_reloadable_features_udp_listener_updates_filter_chain_in_place); RUNTIME_GUARD(envoy_reloadable_features_update_expected_rq_timeout_on_retry); +RUNTIME_GUARD(envoy_reloadable_features_update_grpc_response_error_tag); RUNTIME_GUARD(envoy_reloadable_features_use_dns_ttl); RUNTIME_GUARD(envoy_reloadable_features_validate_connect); RUNTIME_GUARD(envoy_restart_features_explicit_wildcard_resource); @@ -167,6 +168,7 @@ constexpr absl::Flag* runtime_features[] = { &FLAGS_envoy_reloadable_features_udp_listener_updates_filter_chain_in_place, &FLAGS_envoy_reloadable_features_unified_mux, &FLAGS_envoy_reloadable_features_update_expected_rq_timeout_on_retry, + &FLAGS_envoy_reloadable_features_update_grpc_response_error_tag, &FLAGS_envoy_reloadable_features_use_dns_ttl, &FLAGS_envoy_reloadable_features_validate_connect, &FLAGS_envoy_restart_features_explicit_wildcard_resource, diff --git a/source/common/tracing/http_tracer_impl.cc b/source/common/tracing/http_tracer_impl.cc index d72f0275c03a6..f14fe8f832c8d 100644 --- a/source/common/tracing/http_tracer_impl.cc +++ b/source/common/tracing/http_tracer_impl.cc @@ -84,10 +84,45 @@ static void addGrpcRequestTags(Span& span, const Http::RequestHeaderMap& headers template static void addGrpcResponseTags(Span& span, const T& headers) { addTagIfNotNull(span, Tracing::Tags::get().GrpcStatusCode, headers.GrpcStatus()); addTagIfNotNull(span, Tracing::Tags::get().GrpcMessage, headers.GrpcMessage()); - // Set error tag when status is not OK. + // Set error tag when Grpc status code represents an upstream error. See + // https://github.com/envoyproxy/envoy/issues/18877. absl::optional grpc_status_code = Grpc::Common::getGrpcStatus(headers); - if (grpc_status_code && grpc_status_code.value() != Grpc::Status::WellKnownGrpcStatus::Ok) { - span.setTag(Tracing::Tags::get().Error, Tracing::Tags::get().True); + if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) { + if (grpc_status_code.has_value()) { + const auto& status = grpc_status_code.value(); + if (status != Grpc::Status::WellKnownGrpcStatus::InvalidCode) { + switch (status) { + // Each case below is considered to be a client side error, therefore should not be + // tagged as an upstream error. See https://grpc.github.io/grpc/core/md_doc_statuscodes.html + // for more details about how each Grpc status code is defined and whether it is an + // upstream error or a client error. + case Grpc::Status::WellKnownGrpcStatus::Ok: + case Grpc::Status::WellKnownGrpcStatus::Canceled: + case Grpc::Status::WellKnownGrpcStatus::InvalidArgument: + case Grpc::Status::WellKnownGrpcStatus::NotFound: + case Grpc::Status::WellKnownGrpcStatus::AlreadyExists: + case Grpc::Status::WellKnownGrpcStatus::PermissionDenied: + case Grpc::Status::WellKnownGrpcStatus::FailedPrecondition: + case Grpc::Status::WellKnownGrpcStatus::Aborted: + case Grpc::Status::WellKnownGrpcStatus::OutOfRange: + case Grpc::Status::WellKnownGrpcStatus::Unauthenticated: + break; + case Grpc::Status::WellKnownGrpcStatus::Unknown: + case Grpc::Status::WellKnownGrpcStatus::DeadlineExceeded: + case Grpc::Status::WellKnownGrpcStatus::Unimplemented: + case Grpc::Status::WellKnownGrpcStatus::ResourceExhausted: + case Grpc::Status::WellKnownGrpcStatus::Internal: + case Grpc::Status::WellKnownGrpcStatus::Unavailable: + case Grpc::Status::WellKnownGrpcStatus::DataLoss: + span.setTag(Tracing::Tags::get().Error, Tracing::Tags::get().True); + break; + } + } + } + } else { + if (grpc_status_code && grpc_status_code.value() != Grpc::Status::WellKnownGrpcStatus::Ok) { + span.setTag(Tracing::Tags::get().Error, Tracing::Tags::get().True); + } } } diff --git a/test/common/tracing/BUILD b/test/common/tracing/BUILD index 2a6621554ba17..449fb54efa33c 100644 --- a/test/common/tracing/BUILD +++ b/test/common/tracing/BUILD @@ -8,11 +8,47 @@ licenses(["notice"]) # Apache 2 envoy_package() +envoy_cc_test( + name = "http_tracer_impl_legacy_test", + srcs = [ + "http_tracer_impl_test.cc", + ], + args = [ + "--runtime-feature-disable-for-tests=envoy.reloadable_features.update_grpc_response_error_tag", + ], + coverage = True, + deps = [ + "//source/common/common:base64_lib", + "//source/common/http:header_map_lib", + "//source/common/http:headers_lib", + "//source/common/http:message_lib", + "//source/common/network:address_lib", + "//source/common/runtime:runtime_lib", + "//source/common/tracing:custom_tag_lib", + "//source/common/tracing:http_tracer_lib", + "//test/mocks/http:http_mocks", + "//test/mocks/local_info:local_info_mocks", + "//test/mocks/router:router_mocks", + "//test/mocks/runtime:runtime_mocks", + "//test/mocks/stats:stats_mocks", + "//test/mocks/thread_local:thread_local_mocks", + "//test/mocks/tracing:tracing_mocks", + "//test/test_common:environment_lib", + "//test/test_common:utility_lib", + "@envoy_api//envoy/config/core/v3:pkg_cc_proto", + "@envoy_api//envoy/type/tracing/v3:pkg_cc_proto", + ], +) + envoy_cc_test( name = "http_tracer_impl_test", srcs = [ "http_tracer_impl_test.cc", ], + args = [ + "--runtime-feature-override-for-tests=envoy.reloadable_features.update_grpc_response_error_tag", + ], + coverage = True, deps = [ "//source/common/common:base64_lib", "//source/common/http:header_map_lib", diff --git a/test/common/tracing/http_tracer_impl_test.cc b/test/common/tracing/http_tracer_impl_test.cc index 35d35edae87a0..5dcd1932f009e 100644 --- a/test/common/tracing/http_tracer_impl_test.cc +++ b/test/common/tracing/http_tracer_impl_test.cc @@ -622,8 +622,15 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcErrorTag) { Http::TestResponseHeaderMapImpl response_headers{{":status", "200"}, {"content-type", "application/grpc"}}; - Http::TestResponseTrailerMapImpl response_trailers{{"grpc-status", "7"}, - {"grpc-message", "permission denied"}}; + + Http::TestResponseTrailerMapImpl response_trailers; + if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) { + response_trailers.setGrpcStatus("14"); + response_trailers.setGrpcMessage("unavailable"); + } else { + response_trailers.setGrpcStatus("7"); + response_trailers.setGrpcMessage("permission denied"); + } absl::optional protocol = Http::Protocol::Http2; absl::optional response_code(200); @@ -634,7 +641,6 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcErrorTag) { stream_info.downstream_connection_info_provider_->setDirectRemoteAddressForTest(remote_address); EXPECT_CALL(span, setTag(_, _)).Times(testing::AnyNumber()); - EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpMethod), Eq("POST"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpProtocol), Eq("HTTP/2"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpStatusCode), Eq("200"))); @@ -642,8 +648,14 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcErrorTag) { EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcAuthority), Eq("example.com:80"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcContentType), Eq("application/grpc"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcTimeout), Eq("10s"))); - EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7"))); - EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied"))); + if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) { + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("14"))); + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("unavailable"))); + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True))); + } else { + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7"))); + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied"))); + } EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().PeerAddress), Eq(expected_ip))); HttpTracerUtility::finalizeDownstreamSpan(span, &request_headers, &response_headers, @@ -665,9 +677,16 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcTrailersOnly) { {"te", "trailers"}}; Http::TestResponseHeaderMapImpl response_headers{{":status", "200"}, - {"content-type", "application/grpc"}, - {"grpc-status", "7"}, - {"grpc-message", "permission denied"}}; + {"content-type", "application/grpc"}}; + + if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) { + response_headers.setGrpcStatus("14"); + response_headers.setGrpcMessage("unavailable"); + } else { + response_headers.setGrpcStatus("7"); + response_headers.setGrpcMessage("permission denied"); + } + Http::TestResponseTrailerMapImpl response_trailers; absl::optional protocol = Http::Protocol::Http2; @@ -679,15 +698,20 @@ TEST_F(HttpConnManFinalizerImplTest, GrpcTrailersOnly) { stream_info.downstream_connection_info_provider_->setDirectRemoteAddressForTest(remote_address); EXPECT_CALL(span, setTag(_, _)).Times(testing::AnyNumber()); - EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpMethod), Eq("POST"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpProtocol), Eq("HTTP/2"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().HttpStatusCode), Eq("200"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcPath), Eq("/pb.Foo/Bar"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcAuthority), Eq("example.com:80"))); EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcContentType), Eq("application/grpc"))); - EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7"))); - EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied"))); + if (Runtime::runtimeFeatureEnabled("envoy.reloadable_features.update_grpc_response_error_tag")) { + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("14"))); + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("unavailable"))); + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True))); + } else { + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7"))); + EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied"))); + } EXPECT_CALL(span, setTag(Eq(Tracing::Tags::get().PeerAddress), Eq(expected_ip))); HttpTracerUtility::finalizeDownstreamSpan(span, &request_headers, &response_headers, @@ -740,13 +764,12 @@ class HttpTracerImplTest : public testing::Test { Upstream::HostDescriptionConstSharedPtr shared_host(host_); stream_info_.upstreamInfo()->setUpstreamHost(shared_host); } - Http::TestRequestHeaderMapImpl request_headers_{ {":path", "/"}, {":method", "GET"}, {"x-request-id", "foo"}, {":authority", "test"}}; Http::TestResponseHeaderMapImpl response_headers_{{":status", "200"}, {"content-type", "application/grpc"}, - {"grpc-status", "7"}, - {"grpc-message", "permission denied"}}; + {"grpc-status", "14"}, + {"grpc-message", "unavailable"}}; Http::TestResponseTrailerMapImpl response_trailers_; NiceMock stream_info_; NiceMock local_info_; @@ -823,8 +846,8 @@ TEST_F(HttpTracerImplTest, ChildUpstreamSpanTest) { EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().UpstreamClusterName), Eq("ob fake cluster"))); EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().HttpStatusCode), Eq("200"))); - EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("7"))); - EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("permission denied"))); + EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcStatusCode), Eq("14"))); + EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().GrpcMessage), Eq("unavailable"))); EXPECT_CALL(*second_span, setTag(Eq(Tracing::Tags::get().Error), Eq(Tracing::Tags::get().True))); HttpTracerUtility::finalizeUpstreamSpan(*child_span, &response_headers_, &response_trailers_,