diff --git a/changelogs/current.yaml b/changelogs/current.yaml index 75d150ee9f81c..c9b688af6dff5 100644 --- a/changelogs/current.yaml +++ b/changelogs/current.yaml @@ -45,6 +45,10 @@ bug_fixes: - area: grpc change: | Fixed a bug in gRPC async client cache which intermittently causes CPU spikes due to busy loop in timer expiration. +- area: tracing + change: | + Fixed a bug that caused the Datadog tracing extension to drop traces that + should be kept on account of an extracted sampling decision. - area: quic change: | Fixed a bug in QUIC and HCM interaction which could cause use-after-free during asynchronous certificates retrieval. diff --git a/source/extensions/tracers/datadog/tracer.cc b/source/extensions/tracers/datadog/tracer.cc index cda804d1820e2..f22b097569587 100644 --- a/source/extensions/tracers/datadog/tracer.cc +++ b/source/extensions/tracers/datadog/tracer.cc @@ -20,6 +20,7 @@ #include "datadog/sampling_priority.h" #include "datadog/span_config.h" #include "datadog/trace_segment.h" +#include "datadog/tracer_config.h" namespace Envoy { namespace Extensions { @@ -94,8 +95,27 @@ Tracing::SpanPtr Tracer::startSpan(const Tracing::Config&, Tracing::TraceContext span_config.resource = operation_name; span_config.start = estimateTime(stream_info.startTime()); - datadog::tracing::Tracer& tracer = *thread_local_tracer.tracer; TraceContextReader reader{trace_context}; + datadog::tracing::Span span = + extract_or_create_span(*thread_local_tracer.tracer, span_config, reader); + + // If we did not extract a sampling decision, and if Envoy is telling us to + // drop the trace, then we treat that as a "user drop" (manual override). + // + // If Envoy is telling us to keep the trace, then we leave it up to the + // tracer's internal sampler (which might decide to drop the trace anyway). 
+ if (!span.trace_segment().sampling_decision().has_value() && !tracing_decision.traced) { + span.trace_segment().override_sampling_priority( + int(datadog::tracing::SamplingPriority::USER_DROP)); + } + + return std::make_unique(std::move(span)); +} + +datadog::tracing::Span +Tracer::extract_or_create_span(datadog::tracing::Tracer& tracer, + const datadog::tracing::SpanConfig& span_config, + const datadog::tracing::DictReader& reader) { datadog::tracing::Expected maybe_span = tracer.extract_span(reader, span_config); if (datadog::tracing::Error* error = maybe_span.if_error()) { @@ -111,23 +131,10 @@ Tracing::SpanPtr Tracer::startSpan(const Tracing::Config&, Tracing::TraceContext int(error->code), error->message); } - maybe_span = tracer.create_span(span_config); - } - - ASSERT(maybe_span); - datadog::tracing::Span& span = *maybe_span; - - // If Envoy is telling us to drop the trace, then we treat that as a - // "user drop" (manual override). - // - // If Envoy is telling us to keep the trace, then we leave it up to the - // tracer's internal sampler (which might decide to drop the trace anyway). 
- if (!tracing_decision.traced) { - span.trace_segment().override_sampling_priority( - int(datadog::tracing::SamplingPriority::USER_DROP)); + return tracer.create_span(span_config); } - return std::make_unique(std::move(span)); + return std::move(*maybe_span); } } // namespace Datadog diff --git a/source/extensions/tracers/datadog/tracer.h b/source/extensions/tracers/datadog/tracer.h index 0e433760a304d..f04f7253b51e5 100644 --- a/source/extensions/tracers/datadog/tracer.h +++ b/source/extensions/tracers/datadog/tracer.h @@ -10,7 +10,18 @@ #include "source/extensions/tracers/datadog/tracer_stats.h" #include "datadog/tracer.h" -#include "datadog/tracer_config.h" + +namespace datadog { +namespace tracing { + +class DictReader; +class FinalizedTracerConfig; +class Span; +struct SpanConfig; +struct TracerConfig; + +} // namespace tracing +} // namespace datadog namespace Envoy { namespace Extensions { @@ -73,8 +84,8 @@ class Tracer : public Tracing::Driver, private Logger::Loggable thread_local_slot_; }; diff --git a/test/extensions/tracers/datadog/tracer_test.cc b/test/extensions/tracers/datadog/tracer_test.cc index 8a4e276fd3daa..7cd37a8a33291 100644 --- a/test/extensions/tracers/datadog/tracer_test.cc +++ b/test/extensions/tracers/datadog/tracer_test.cc @@ -1,3 +1,5 @@ +#include + #include "envoy/tracing/trace_reason.h" #include "source/common/tracing/null_span_impl.h" @@ -7,11 +9,14 @@ #include "test/mocks/stream_info/mocks.h" #include "test/mocks/thread_local/mocks.h" #include "test/mocks/upstream/cluster_manager.h" +#include "test/test_common/environment.h" #include "test/test_common/simulated_time_system.h" #include "test/test_common/utility.h" #include "datadog/error.h" #include "datadog/expected.h" +#include "datadog/optional.h" +#include "datadog/propagation_style.h" #include "datadog/sampling_priority.h" #include "datadog/trace_segment.h" #include "datadog/tracer_config.h" @@ -23,6 +28,30 @@ namespace Tracers { namespace Datadog { namespace { +class 
EnvVarGuard { +public: + EnvVarGuard(const std::string& name, const std::string& value) : name_(name) { + if (const char* const previous = std::getenv(name.c_str())) { + previous_value_ = previous; + } + const int overwrite = 1; // Yes, overwrite it. + TestEnvironment::setEnvVar(name, value, overwrite); + } + + ~EnvVarGuard() { + if (previous_value_) { + const int overwrite = 1; // Yes, overwrite it. + TestEnvironment::setEnvVar(name_, *previous_value_, overwrite); + } else { + TestEnvironment::unsetEnvVar(name_); + } + } + +private: + std::string name_; + datadog::tracing::Optional previous_value_; +}; + class DatadogTracerTest : public testing::Test { public: DatadogTracerTest() { @@ -203,6 +232,145 @@ TEST_F(DatadogTracerTest, ExtractionFailure) { ASSERT_TRUE(maybe_dd_span); } +TEST_F(DatadogTracerTest, EnvoySamplingVersusExtractedSampling) { + // Verify that sampling decisions extracted from incoming requests are honored + // regardless of the sampling decision made by Envoy (i.e. `bool + // Tracing::Decision::traced`). + // + // We test two styles of extraction: OpenTelemetry's W3C "tracecontext" style + // and Datadog's "datadog" style. When trace context is extracted in either of + // these styles, a sampling decision might be present. If a sampling decision + // is present, then the resulting sampling priority in the extracted trace + // must be the same as that which was extracted. + // + // If a sampling decision is not present in the extracted trace context, then + // an Envoy decision of "drop" is honored. An Envoy decision of "keep" + // delegates the sampling decision to the underlying Datadog tracer, which + // will not make a sampling decision immediately. + + struct Case { + int line; + datadog::tracing::Optional extracted_sampling_priority; + bool envoy_decision_keep; + datadog::tracing::PropagationStyle extraction_style; + // `resulting_sampling_priority` is the sampling priority that results from + // trace context extraction. 
+ // It's not necessarily the sampling priority that would be sent to the + // Datadog Agent. + // If `resulting_sampling_priority` is null, then that means that the tracer + // does not make an initial sampling decision, though it will make one by + // the time it sends spans to the Datadog Agent or injects trace context + // into an outgoing request. + datadog::tracing::Optional resulting_sampling_priority; + } cases[] = { + {__LINE__, datadog::tracing::nullopt, true, datadog::tracing::PropagationStyle::DATADOG, + datadog::tracing::nullopt}, + // Note that the `resulting_sampling_priority` in this case is an artifact + // of "traceparent" always containing a sampling decision in its flags. See + // the main body of the test, below, for more information. + {__LINE__, datadog::tracing::nullopt, true, datadog::tracing::PropagationStyle::W3C, 0}, + // This is the only case, at least in this test, where Envoy's decision + // affects the resulting sampling priority. + {__LINE__, datadog::tracing::nullopt, false, datadog::tracing::PropagationStyle::DATADOG, -1}, + {__LINE__, datadog::tracing::nullopt, false, datadog::tracing::PropagationStyle::W3C, 0}, + + {__LINE__, -1, true, datadog::tracing::PropagationStyle::DATADOG, -1}, + {__LINE__, -1, true, datadog::tracing::PropagationStyle::W3C, -1}, + {__LINE__, -1, false, datadog::tracing::PropagationStyle::DATADOG, -1}, + {__LINE__, -1, false, datadog::tracing::PropagationStyle::W3C, -1}, + + {__LINE__, 0, true, datadog::tracing::PropagationStyle::DATADOG, 0}, + {__LINE__, 0, true, datadog::tracing::PropagationStyle::W3C, 0}, + {__LINE__, 0, false, datadog::tracing::PropagationStyle::DATADOG, 0}, + {__LINE__, 0, false, datadog::tracing::PropagationStyle::W3C, 0}, + + {__LINE__, 1, true, datadog::tracing::PropagationStyle::DATADOG, 1}, + {__LINE__, 1, true, datadog::tracing::PropagationStyle::W3C, 1}, + {__LINE__, 1, false, datadog::tracing::PropagationStyle::DATADOG, 1}, + {__LINE__, 1, false, 
datadog::tracing::PropagationStyle::W3C, 1}, + + {__LINE__, 2, true, datadog::tracing::PropagationStyle::DATADOG, 2}, + {__LINE__, 2, true, datadog::tracing::PropagationStyle::W3C, 2}, + {__LINE__, 2, false, datadog::tracing::PropagationStyle::DATADOG, 2}, + {__LINE__, 2, false, datadog::tracing::PropagationStyle::W3C, 2}, + }; + + for (const Case& test_case : cases) { + std::ostringstream failure_context; + failure_context << "Failure occurred for test entry on line " << test_case.line; + + std::string style_name; + if (test_case.extraction_style == datadog::tracing::PropagationStyle::DATADOG) { + style_name = "datadog"; + } else { + ASSERT_EQ(test_case.extraction_style, datadog::tracing::PropagationStyle::W3C) + << failure_context.str(); + style_name = "tracecontext"; + } + + EnvVarGuard guard{"DD_TRACE_PROPAGATION_STYLE", style_name}; + datadog::tracing::TracerConfig config; + config.defaults.service = "envoy"; + Tracer tracer("fake_cluster", "test_host", config, cluster_manager_, *store_.rootScope(), + thread_local_slot_allocator_); + + Tracing::Decision envoy_decision; + envoy_decision.reason = Tracing::Reason::Sampling; + envoy_decision.traced = test_case.envoy_decision_keep; + + const std::string operation_name = "do.thing"; + + Tracing::TestTraceContextImpl context{{}}; + if (test_case.extraction_style == datadog::tracing::PropagationStyle::DATADOG) { + context.context_map_["x-datadog-trace-id"] = "123"; + context.context_map_["x-datadog-parent-id"] = "456"; + if (test_case.extracted_sampling_priority) { + context.context_map_["x-datadog-sampling-priority"] = + std::to_string(*test_case.extracted_sampling_priority); + } + } else { + ASSERT_EQ(test_case.extraction_style, datadog::tracing::PropagationStyle::W3C) + << failure_context.str(); + std::string flags; + if (test_case.extracted_sampling_priority) { + const int priority = *test_case.extracted_sampling_priority; + flags = priority <= 0 ? 
"00" : "01"; + context.context_map_["tracestate"] = "dd=s:" + std::to_string(priority); + } else { + // There's no such thing as the absence of a sampling decision with + // "traceparent," so default to "drop." + flags = "00"; + } + context.context_map_["traceparent"] = + "00-0000000000000000000000000000007b-00000000000001c8-" + flags; + } + + const Tracing::SpanPtr span = tracer.startSpan(Tracing::MockConfig{}, context, stream_info_, + operation_name, envoy_decision); + ASSERT_TRUE(span) << failure_context.str(); + const auto as_dd_span_wrapper = dynamic_cast(span.get()); + EXPECT_NE(nullptr, as_dd_span_wrapper) << failure_context.str(); + + const datadog::tracing::Optional& maybe_dd_span = + as_dd_span_wrapper->impl(); + ASSERT_TRUE(maybe_dd_span) << failure_context.str(); + const datadog::tracing::Span& dd_span = *maybe_dd_span; + + const datadog::tracing::Optional decision = + dd_span.trace_segment().sampling_decision(); + if (test_case.resulting_sampling_priority) { + // We expect that the tracer made a sampling decision immediately, and + // that it has the expected sampling priority. + ASSERT_NE(datadog::tracing::nullopt, decision) << failure_context.str(); + EXPECT_EQ(*test_case.resulting_sampling_priority, decision->priority) + << failure_context.str(); + } else { + // We expect that the tracer did not immediately make a sampling decision. + EXPECT_EQ(datadog::tracing::nullopt, decision) << failure_context.str(); + } + } +} + } // namespace } // namespace Datadog } // namespace Tracers