Commit 1c26fd8

Fix going into Ruby code when looking up otel context
`rb_hash_lookup` calls `#hash` on the key being looked up, so it's not safe to use during sampling. This can cause the same issue as we saw in #4195, leading to

```
[BUG] unexpected situation - recordd:1 current:0

-- C level backtrace information -------------------------------------------
ruby(rb_print_backtrace+0x11) [0x55ba03ccf90f] vm_dump.c:820
ruby(rb_vm_bugreport) vm_dump.c:1151
ruby(bug_report_end+0x0) [0x55ba03e91607] error.c:1042
ruby(rb_bug_without_die) error.c:1042
ruby(die+0x0) [0x55ba03ac0998] error.c:1050
ruby(rb_bug) error.c:1052
ruby(disallow_reentry+0x0) [0x55ba03ab6dcc] vm_sync.c:226
ruby(rb_ec_vm_lock_rec_check+0x1a) [0x55ba03cb17aa] eval_intern.h:144
ruby(rb_ec_tag_state) eval_intern.h:155
ruby(rb_vm_exec) vm.c:2484
ruby(vm_invoke_proc+0x201) [0x55ba03cb62b1] vm.c:1509
ruby(rb_vm_invoke_proc+0x33) [0x55ba03cb65d3] vm.c:1728
ruby(thread_do_start_proc+0x176) [0x55ba03c63516] thread.c:598
ruby(thread_do_start+0x12) [0x55ba03c648a2] thread.c:615
ruby(thread_start_func_2) thread.c:672
ruby(nt_start+0x107) [0x55ba03c65137] thread_pthread.c:2187
/lib/x86_64-linux-gnu/libpthread.so.0(start_thread+0xd9) [0x7ff360b66609]
/lib/x86_64-linux-gnu/libc.so.6(clone+0x43) [0x7ff360a70353]
```
1 parent 1482f00 commit 1c26fd8
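
For illustration only (not part of the commit), here is a minimal Ruby sketch of the mechanism behind the fix: looking a key up in a `Hash` invokes `#hash` on that key (which is what `rb_hash_lookup` does from C), whereas simply iterating over the entries does not. The `NoisyKey` class below is hypothetical and exists purely to make the `#hash` call visible.

```ruby
# Hypothetical key class, used only to show when #hash gets called.
class NoisyKey
  def hash
    puts "#hash was called"
    super
  end
end

key = NoisyKey.new
entries = { key => "value" } # prints once: inserting the key also hashes it

entries[key]                 # prints again: a lookup calls #hash on the key, re-entering Ruby code
entries.each { |k, _v| }     # iterating the entries never calls #hash on the lookup key
```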

File tree

2 files changed: +120 -3 lines changed


ext/datadog_profiling_native_extension/collectors_thread_context.c (+37 -2)
```diff
@@ -291,6 +291,7 @@ static void otel_without_ddtrace_trace_identifiers_for(
 );
 static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
 static uint64_t otel_span_id_to_uint(VALUE otel_span_id);
+static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key);
 
 void collectors_thread_context_init(VALUE profiling_module) {
   VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
```
```diff
@@ -1632,7 +1633,7 @@ static void ddtrace_otel_trace_identifiers_for(
   // trace and span representing it. Each ddtrace trace is then connected to the previous otel span, forming a linked
   // list. The local root span is going to be the trace/span we find at the end of this linked list.
   while (otel_values != Qnil) {
-    VALUE otel_span = rb_hash_lookup(otel_values, otel_current_span_key);
+    VALUE otel_span = safely_lookup_hash_without_going_into_ruby_code(otel_values, otel_current_span_key);
     if (otel_span == Qnil) break;
     VALUE next_trace = rb_ivar_get(otel_span, at_datadog_trace_id);
     if (next_trace == Qnil) break;
```
```diff
@@ -1766,7 +1767,7 @@ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_sp
   if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;
 
   // If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
-  VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
+  VALUE span = safely_lookup_hash_without_going_into_ruby_code(context_entries, otel_current_span_key);
   if (span == Qnil) return failed;
 
   // If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
```
```diff
@@ -2092,3 +2093,37 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
   DDTRACE_UNUSED long current_cpu_time_ns
 ) { return false; }
 #endif // NO_GVL_INSTRUMENTATION
+
+#define MAX_SAFE_LOOKUP_SIZE 16
+
+typedef struct { VALUE lookup_key; VALUE result; } safe_lookup_hash_state;
+
+static int safe_lookup_hash_iterate(VALUE key, VALUE value, VALUE state_ptr) {
+  safe_lookup_hash_state *state = (safe_lookup_hash_state *) state_ptr;
+
+  if (key == state->lookup_key) {
+    state->result = value;
+    return ST_STOP;
+  }
+
+  return ST_CONTINUE;
+}
+
+// This method exists because we need to look up a hash during sampling, but we don't want to invoke any
+// Ruby code as a side effect. To be able to look up by hash, `rb_hash_lookup` calls `#hash` on the key,
+// which we want to avoid.
+// Thus, instead, we opt to just iterate through the hash and check if we can find what we're looking for.
+//
+// To avoid having too much overhead here we only iterate in hashes up to MAX_SAFE_LOOKUP_SIZE.
+// Note that we don't even try to iterate if the hash is bigger -- this is to avoid flaky behavior where
+// depending on the internal storage order of the hash we may or may not find the key, and instead we always
+// enforce the size.
+static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key) {
+  if (!RB_TYPE_P(hash, T_HASH) || RHASH_SIZE(hash) > MAX_SAFE_LOOKUP_SIZE) return Qnil;
+
+  safe_lookup_hash_state state = {.lookup_key = key, .result = Qnil};
+
+  rb_hash_foreach(hash, safe_lookup_hash_iterate, (VALUE) &state);
+
+  return state.result;
+}
```
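
For readers less familiar with the Ruby C API, the helper added above behaves roughly like the following Ruby sketch (illustration only; `safely_lookup` is a hypothetical name, not part of the commit). The key points it mirrors: keys are compared by identity, `#hash`/`#eql?` are never called on the lookup key, and hashes larger than `MAX_SAFE_LOOKUP_SIZE` are not scanned at all.

```ruby
MAX_SAFE_LOOKUP_SIZE = 16

# Rough Ruby equivalent of safely_lookup_hash_without_going_into_ruby_code (for illustration)
def safely_lookup(hash, key)
  # Refuse non-hashes and hashes that are too big to scan cheaply during sampling
  return nil unless hash.is_a?(Hash) && hash.size <= MAX_SAFE_LOOKUP_SIZE

  hash.each do |candidate_key, value|
    # Identity comparison, mirroring the `key == state->lookup_key` VALUE check in the C code
    return value if candidate_key.equal?(key)
  end

  nil
end
```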

spec/datadog/profiling/collectors/thread_context_spec.rb (+83 -1)
```diff
@@ -588,6 +588,7 @@ def self.otel_otlp_exporter_available?
     false
   end
 
+  # When opentelemetry-sdk is on the Gemfile, but not opentelemetry-exporter-otlp
   context "when trace comes from otel sdk", if: otel_sdk_available? && !otel_otlp_exporter_available? do
     let(:otel_tracer) do
       require "datadog/opentelemetry"
```
```diff
@@ -622,6 +623,31 @@ def self.otel_otlp_exporter_available?
       expect(t1_sample.labels).to_not include("trace endpoint": anything)
     end
 
+    describe 'accessing the current span' do
+      before do
+        allow(Datadog.logger).to receive(:error)
+
+        # initialize otel context reading
+        sample
+        # clear samples
+        recorder.serialize!
+      end
+
+      it 'does not try to hash the CURRENT_SPAN_KEY' do
+        inner_check_ran = false
+
+        otel_tracer.in_span("profiler.test") do |_span|
+          expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash)
+
+          sample_allocation(weight: 1)
+
+          inner_check_ran = true
+        end
+
+        expect(inner_check_ran).to be true
+      end
+    end
+
     context "when there are multiple otel spans nested" do
       let(:t1) do
         Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer|
```
```diff
@@ -721,6 +747,7 @@ def self.otel_otlp_exporter_available?
     end
   end
 
+  # When opentelemetry-sdk AND opentelemetry-exporter-otlp are on the Gemfile
   context(
     "when trace comes from otel sdk and the ddtrace otel support is not loaded",
     if: otel_sdk_available? && otel_otlp_exporter_available?
```
```diff
@@ -769,7 +796,7 @@ def otel_span_id_to_i(span_id)
       expect(t1_sample.labels).to_not include("trace endpoint": anything)
     end
 
-    context 'reading CURRENT_SPAN_KEY' do
+    describe 'reading CURRENT_SPAN_KEY into otel_current_span_key' do
       let!(:ran_log) { [] }
 
       let(:setup_failure) do
```
```diff
@@ -818,6 +845,61 @@ def otel_span_id_to_i(span_id)
       end
     end
 
+    describe 'accessing the current span' do
+      before do
+        allow(OpenTelemetry.logger).to receive(:error)
+
+        # initialize otel context reading
+        sample
+        # clear samples
+        recorder.serialize!
+      end
+
+      it 'does not try to hash the CURRENT_SPAN_KEY' do
+        inner_check_ran = false
+
+        otel_tracer.in_span("profiler.test") do |_span|
+          expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash)
+
+          sample_allocation(weight: 1)
+
+          inner_check_ran = true
+        end
+
+        expect(inner_check_ran).to be true
+      end
+
+      context 'when there are more than MAX_SAFE_LOOKUP_SIZE entries in the otel context' do
+        let(:max_safe_lookup_size) { 16 } # Value of MAX_SAFE_LOOKUP_SIZE in C code
+
+        it 'does not try to look up the context' do
+          otel_tracer.in_span("profiler.test") do |_span|
+            current_size = OpenTelemetry::Context.current.instance_variable_get(:@entries).size
+
+            OpenTelemetry::Context.with_values(
+              Array.new((max_safe_lookup_size + 1 - current_size)) { |it| ["key_#{it}", it] }.to_h
+            ) do
+              sample_allocation(weight: 12)
+            end
+
+            OpenTelemetry::Context.with_values(
+              Array.new((max_safe_lookup_size - current_size)) { |it| ["key_#{it}", it] }.to_h
+            ) do
+              sample_allocation(weight: 34)
+            end
+          end
+
+          result = samples_for_thread(samples, Thread.current)
+
+          expect(result.size).to be 2
+          expect(result.find { |it| it.values.fetch(:"alloc-samples") == 12 }.labels.keys)
+            .to_not include(:"local root span id", :"span id")
+          expect(result.find { |it| it.values.fetch(:"alloc-samples") == 34 }.labels.keys)
+            .to include(:"local root span id", :"span id")
+        end
+      end
+    end
+
     context 'when otel_context_enabled is false' do
       let(:otel_context_enabled) { false }
 
```