Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PROF-11003] Fix allocation profiling + otel tracing causing Ruby crash #4240

Merged
merged 7 commits into from
Jan 3, 2025
Prev Previous commit
Next Next commit
Fix going into Ruby code when looking up otel context
`rb_hash_lookup` calls `#hash` on the key being looked up so it's safe
to use unless during sampling.

This can cause the same issue as we saw in #4195 leading to

```
[BUG] unexpected situation - recordd:1 current:0

-- C level backtrace information -------------------------------------------
ruby(rb_print_backtrace+0x11) [0x55ba03ccf90f] vm_dump.c:820
ruby(rb_vm_bugreport) vm_dump.c:1151
ruby(bug_report_end+0x0) [0x55ba03e91607] error.c:1042
ruby(rb_bug_without_die) error.c:1042
ruby(die+0x0) [0x55ba03ac0998] error.c:1050
ruby(rb_bug) error.c:1052
ruby(disallow_reentry+0x0) [0x55ba03ab6dcc] vm_sync.c:226
ruby(rb_ec_vm_lock_rec_check+0x1a) [0x55ba03cb17aa] eval_intern.h:144
ruby(rb_ec_tag_state) eval_intern.h:155
ruby(rb_vm_exec) vm.c:2484
ruby(vm_invoke_proc+0x201) [0x55ba03cb62b1] vm.c:1509
ruby(rb_vm_invoke_proc+0x33) [0x55ba03cb65d3] vm.c:1728
ruby(thread_do_start_proc+0x176) [0x55ba03c63516] thread.c:598
ruby(thread_do_start+0x12) [0x55ba03c648a2] thread.c:615
ruby(thread_start_func_2) thread.c:672
ruby(nt_start+0x107) [0x55ba03c65137] thread_pthread.c:2187
/lib/x86_64-linux-gnu/libpthread.so.0(start_thread+0xd9) [0x7ff360b66609]
/lib/x86_64-linux-gnu/libc.so.6(clone+0x43) [0x7ff360a70353]
```
  • Loading branch information
ivoanjo committed Dec 19, 2024
commit 1c26fd855211760e26278406b44d06cef76fb646
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ static void otel_without_ddtrace_trace_identifiers_for(
);
static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
static uint64_t otel_span_id_to_uint(VALUE otel_span_id);
static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key);

void collectors_thread_context_init(VALUE profiling_module) {
VALUE collectors_module = rb_define_module_under(profiling_module, "Collectors");
Expand Down Expand Up @@ -1632,7 +1633,7 @@ static void ddtrace_otel_trace_identifiers_for(
// trace and span representing it. Each ddtrace trace is then connected to the previous otel span, forming a linked
// list. The local root span is going to be the trace/span we find at the end of this linked list.
while (otel_values != Qnil) {
VALUE otel_span = rb_hash_lookup(otel_values, otel_current_span_key);
VALUE otel_span = safely_lookup_hash_without_going_into_ruby_code(otel_values, otel_current_span_key);
if (otel_span == Qnil) break;
VALUE next_trace = rb_ivar_get(otel_span, at_datadog_trace_id);
if (next_trace == Qnil) break;
Expand Down Expand Up @@ -1766,7 +1767,7 @@ static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_sp
if (context_entries == Qnil || !RB_TYPE_P(context_entries, T_HASH)) return failed;

// If it exists, context_entries is expected to be a Hash[OpenTelemetry::Context::Key, OpenTelemetry::Trace::Span]
VALUE span = rb_hash_lookup(context_entries, otel_current_span_key);
VALUE span = safely_lookup_hash_without_going_into_ruby_code(context_entries, otel_current_span_key);
if (span == Qnil) return failed;

// If it exists, span_context is expected to be a OpenTelemetry::Trace::SpanContext (don't confuse it with OpenTelemetry::Context)
Expand Down Expand Up @@ -2092,3 +2093,37 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
DDTRACE_UNUSED long current_cpu_time_ns
) { return false; }
#endif // NO_GVL_INSTRUMENTATION

#define MAX_SAFE_LOOKUP_SIZE 16

typedef struct { VALUE lookup_key; VALUE result; } safe_lookup_hash_state;

static int safe_lookup_hash_iterate(VALUE key, VALUE value, VALUE state_ptr) {
safe_lookup_hash_state *state = (safe_lookup_hash_state *) state_ptr;

if (key == state->lookup_key) {
state->result = value;
return ST_STOP;
}

return ST_CONTINUE;
}

// This method exists because we need to look up a hash during sampling, but we don't want to invoke any
// Ruby code as a side effect. To be able to look up by hash, `rb_hash_lookup` calls `#hash` on the key,
// which we want to avoid.
// Thus, instead, we opt to just iterate through the hash and check if we can find what we're looking for.
//
// To avoid having too much overhead here we only iterate in hashes up to MAX_SAFE_LOOKUP_SIZE.
// Note that we don't even try to iterate if the hash is bigger -- this is to avoid flaky behavior where
// depending on the internal storage order of the hash we may or not find the key, and instead we always
// enforce the size.
static VALUE safely_lookup_hash_without_going_into_ruby_code(VALUE hash, VALUE key) {
if (!RB_TYPE_P(hash, T_HASH) || RHASH_SIZE(hash) > MAX_SAFE_LOOKUP_SIZE) return Qnil;

safe_lookup_hash_state state = {.lookup_key = key, .result = Qnil};

rb_hash_foreach(hash, safe_lookup_hash_iterate, (VALUE) &state);

return state.result;
}
84 changes: 83 additions & 1 deletion spec/datadog/profiling/collectors/thread_context_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,7 @@ def self.otel_otlp_exporter_available?
false
end

# When opentelemetry-sdk is on the Gemfile, but not opentelemetry-exporter-otlp
context "when trace comes from otel sdk", if: otel_sdk_available? && !otel_otlp_exporter_available? do
let(:otel_tracer) do
require "datadog/opentelemetry"
Expand Down Expand Up @@ -622,6 +623,31 @@ def self.otel_otlp_exporter_available?
expect(t1_sample.labels).to_not include("trace endpoint": anything)
end

describe 'accessing the current span' do
before do
allow(Datadog.logger).to receive(:error)

# initialize otel context reading
sample
# clear samples
recorder.serialize!
end

it 'does not try to hash the CURRENT_SPAN_KEY' do
inner_check_ran = false

otel_tracer.in_span("profiler.test") do |_span|
expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash)

sample_allocation(weight: 1)

inner_check_ran = true
end

expect(inner_check_ran).to be true
end
end

context "when there are multiple otel spans nested" do
let(:t1) do
Thread.new(ready_queue, otel_tracer) do |ready_queue, otel_tracer|
Expand Down Expand Up @@ -721,6 +747,7 @@ def self.otel_otlp_exporter_available?
end
end

# When opentelemetry-sdk AND opentelemetry-exporter-otlp are on the Gemfile
context(
"when trace comes from otel sdk and the ddtrace otel support is not loaded",
if: otel_sdk_available? && otel_otlp_exporter_available?
Expand Down Expand Up @@ -769,7 +796,7 @@ def otel_span_id_to_i(span_id)
expect(t1_sample.labels).to_not include("trace endpoint": anything)
end

context 'reading CURRENT_SPAN_KEY' do
describe 'reading CURRENT_SPAN_KEY into otel_current_span_key' do
let!(:ran_log) { [] }

let(:setup_failure) do
Expand Down Expand Up @@ -818,6 +845,61 @@ def otel_span_id_to_i(span_id)
end
end

describe 'accessing the current span' do
before do
allow(OpenTelemetry.logger).to receive(:error)

# initialize otel context reading
sample
# clear samples
recorder.serialize!
end

it 'does not try to hash the CURRENT_SPAN_KEY' do
inner_check_ran = false

otel_tracer.in_span("profiler.test") do |_span|
expect(OpenTelemetry::Trace.const_get(:CURRENT_SPAN_KEY)).to_not receive(:hash)

sample_allocation(weight: 1)

inner_check_ran = true
end

expect(inner_check_ran).to be true
end

context 'when there are more than MAX_SAFE_LOOKUP_SIZE entries in the otel context' do
let(:max_safe_lookup_size) { 16 } # Value of MAX_SAFE_LOOKUP_SIZE in C code

it 'does not try to look up the context' do
otel_tracer.in_span("profiler.test") do |_span|
current_size = OpenTelemetry::Context.current.instance_variable_get(:@entries).size

OpenTelemetry::Context.with_values(
Array.new((max_safe_lookup_size + 1 - current_size)) { |it| ["key_#{it}", it] }.to_h
) do
sample_allocation(weight: 12)
end

OpenTelemetry::Context.with_values(
Array.new((max_safe_lookup_size - current_size)) { |it| ["key_#{it}", it] }.to_h
) do
sample_allocation(weight: 34)
end
end

result = samples_for_thread(samples, Thread.current)

expect(result.size).to be 2
expect(result.find { |it| it.values.fetch(:"alloc-samples") == 12 }.labels.keys)
.to_not include(:"local root span id", :"span id")
expect(result.find { |it| it.values.fetch(:"alloc-samples") == 34 }.labels.keys)
.to include(:"local root span id", :"span id")
end
end
end

context 'when otel_context_enabled is false' do
let(:otel_context_enabled) { false }

Expand Down