Skip to content

Commit

Permalink
Merge pull request #4178 from DataDog/ivoanjo/prof-10978-drop-heap-profiling-legacy-ruby
Browse files Browse the repository at this point in the history

[PROF-10978] Require Ruby 3.1+ for heap profiling
  • Loading branch information
ivoanjo authored Dec 2, 2024
2 parents ca7cc9d + 39ffa38 commit 5705775
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 298 deletions.
8 changes: 0 additions & 8 deletions ext/datadog_profiling_native_extension/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,6 @@ def skip_building_extension!(reason)
# On older Rubies, there was no jit_return member on the rb_control_frame_t struct
$defs << "-DNO_JIT_RETURN" if RUBY_VERSION < "3.1"

# On older Rubies, rb_gc_force_recycle allowed to free objects in a way that
# would be invisible to free tracepoints, finalizers and without cleaning
# obj_to_id_tbl mappings.
$defs << "-DHAVE_WORKING_RB_GC_FORCE_RECYCLE" if RUBY_VERSION < "3.1"

# On older Rubies, there are no Ractors
$defs << "-DNO_RACTORS" if RUBY_VERSION < "3"

Expand All @@ -184,9 +179,6 @@ def skip_building_extension!(reason)
# On older Rubies, objects would not move
$defs << "-DNO_T_MOVED" if RUBY_VERSION < "2.7"

# On older Rubies, there was no RUBY_SEEN_OBJ_ID flag
$defs << "-DNO_SEEN_OBJ_ID_FLAG" if RUBY_VERSION < "2.7"

# On older Rubies, rb_global_vm_lock_struct did not include the owner field
$defs << "-DNO_GVL_OWNER" if RUBY_VERSION < "2.6"

Expand Down
96 changes: 7 additions & 89 deletions ext/datadog_profiling_native_extension/heap_recorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@
#include "libdatadog_helpers.h"
#include "time_helpers.h"

#if (defined(HAVE_WORKING_RB_GC_FORCE_RECYCLE) && ! defined(NO_SEEN_OBJ_ID_FLAG))
#define CAN_APPLY_GC_FORCE_RECYCLE_BUG_WORKAROUND
#endif

// Minimum age (in GC generations) of heap objects we want to include in heap
// recorder iterations. Object with age 0 represent objects that have yet to undergo
// a GC and, thus, may just be noise/trash at instant of iteration and are usually not
Expand Down Expand Up @@ -123,9 +119,6 @@ typedef struct {
// Pointer to the (potentially partial) object_record containing metadata about an ongoing recording.
// When NULL, this symbolizes an unstarted/invalid recording.
object_record *object_record;
// A flag to track whether we had to force set the RUBY_FL_SEEN_OBJ_ID flag on this object
// as part of our workaround around rb_gc_force_recycle issues.
bool did_recycle_workaround;
} recording;

struct heap_recorder {
Expand Down Expand Up @@ -342,46 +335,12 @@ void start_heap_allocation_recording(heap_recorder *heap_recorder, VALUE new_obj
rb_raise(rb_eRuntimeError, "Detected a bignum object id. These are not supported by heap profiling.");
}

bool did_recycle_workaround = false;

#ifdef CAN_APPLY_GC_FORCE_RECYCLE_BUG_WORKAROUND
// If we are in a ruby version that has a working rb_gc_force_recycle implementation,
// its usage may lead to an object being re-used outside of the typical GC cycle.
//
// This re-use is in theory invisible to us unless we're lucky enough to sample both
// the original object and the replacement that uses the recycled slot.
//
// In practice, we've observed (https://github.com/DataDog/dd-trace-rb/pull/3366)
// that non-noop implementations of rb_gc_force_recycle have an implementation bug
// which results in the object that re-used the recycled slot inheriting the same
// object id without setting the FL_SEEN_OBJ_ID flag. We rely on this knowledge to
// "observe" implicit frees when an object we are tracking is force-recycled.
//
// However, it may happen that we start tracking a new object and that object was
// allocated on a recycled slot. Due to the bug, this object would be missing the
// FL_SEEN_OBJ_ID flag even though it was not recycled itself. If we left it be,
// when we're doing our liveness check, the absence of the flag would trigger our
// implicit free workaround and the object would be inferred as recycled even though
// it might still be alive.
//
// Thus, if we detect that this new allocation is already missing the flag at the start
// of the heap allocation recording, we force-set it. This should be safe since we
// just called rb_obj_id on it above and the expectation is that any flaggable object
// that goes through it ends up with the flag set (as evidenced by the GC_ASSERT
// lines in https://github.com/ruby/ruby/blob/4a8d7246d15b2054eacb20f8ab3d29d39a3e7856/gc.c#L4050C14-L4050C14).
if (RB_FL_ABLE(new_obj) && !RB_FL_TEST(new_obj, RUBY_FL_SEEN_OBJ_ID)) {
RB_FL_SET(new_obj, RUBY_FL_SEEN_OBJ_ID);
did_recycle_workaround = true;
}
#endif

heap_recorder->active_recording = (recording) {
.object_record = object_record_new(FIX2LONG(ruby_obj_id), NULL, (live_object_data) {
.weight = weight * heap_recorder->sample_rate,
.class = alloc_class != NULL ? string_from_char_slice(*alloc_class) : NULL,
.alloc_gen = rb_gc_count(),
}),
.did_recycle_workaround = did_recycle_workaround,
}),
};
}

Expand Down Expand Up @@ -685,41 +644,6 @@ static int st_object_record_update(st_data_t key, st_data_t value, st_data_t ext

// If we got this far, then we found a valid live object for the tracked id.

#ifdef CAN_APPLY_GC_FORCE_RECYCLE_BUG_WORKAROUND
// If we are in a ruby version that has a working rb_gc_force_recycle implementation,
// its usage may lead to an object being re-used outside of the typical GC cycle.
//
// This re-use is in theory invisible to us and would mean that the ref from which we
// collected the object_record metadata may not be the same as the current ref and
// thus any further reporting would be innacurately attributed to stale metadata.
//
// In practice, there is a way for us to notice that this happened because of a bug
// in the implementation of rb_gc_force_recycle. Our heap profiler relies on object
// ids and id2ref to detect whether objects are still alive. Turns out that when an
// object with an id is re-used via rb_gc_force_recycle, it will "inherit" the ID
// of the old object but it will NOT have the FL_SEEN_OBJ_ID as per the experiment
// in https://github.com/DataDog/dd-trace-rb/pull/3360#discussion_r1442823517
//
// Thus, if we detect that the ref we just resolved above is missing this flag, we can
// safely say re-use happened and thus treat it as an implicit free of the object
// we were tracking (the original one which got recycled).
if (RB_FL_ABLE(ref) && !RB_FL_TEST(ref, RUBY_FL_SEEN_OBJ_ID)) {

// NOTE: We don't really need to set this flag for heap recorder to work correctly
// but doing so partially mitigates a bug in runtimes with working rb_gc_force_recycle
// which leads to broken invariants and leaking of entries in obj_to_id and id_to_obj
// tables in objspace. We already do the same thing when we sample a recycled object,
// here we apply it as well to objects that replace recycled objects that were being
// tracked. More details in https://github.com/DataDog/dd-trace-rb/pull/3366
RB_FL_SET(ref, RUBY_FL_SEEN_OBJ_ID);

on_committed_object_record_cleanup(recorder, record);
recorder->stats_last_update.objects_dead++;
return ST_DELETE;
}

#endif

if (
recorder->size_enabled &&
recorder->update_include_old && // We only update sizes when doing a full update
Expand Down Expand Up @@ -803,18 +727,12 @@ static int update_object_record_entry(DDTRACE_UNUSED st_data_t *key, st_data_t *
object_record *new_object_record = recording.object_record;
if (existing) {
object_record *existing_record = (object_record*) (*value);
if (recording.did_recycle_workaround) {
// In this case, it's possible for an object id to be re-used and we were lucky enough to have
// sampled both the original object and the replacement so cleanup the old one and replace it with
// the new object_record (i.e. treat this as a combined free+allocation).
on_committed_object_record_cleanup(update_data->heap_recorder, existing_record);
} else {
// This is not supposed to happen, raising...
VALUE existing_inspect = object_record_inspect(existing_record);
VALUE new_inspect = object_record_inspect(new_object_record);
rb_raise(rb_eRuntimeError, "Object ids are supposed to be unique. We got 2 allocation recordings with "
"the same id. previous=%"PRIsVALUE" new=%"PRIsVALUE, existing_inspect, new_inspect);
}

// This is not supposed to happen, raising...
VALUE existing_inspect = object_record_inspect(existing_record);
VALUE new_inspect = object_record_inspect(new_object_record);
rb_raise(rb_eRuntimeError, "Object ids are supposed to be unique. We got 2 allocation recordings with "
"the same id. previous=%"PRIsVALUE" new=%"PRIsVALUE, existing_inspect, new_inspect);
}
// Always carry on with the update, we want the new record to be there at the end
(*value) = (st_data_t) new_object_record;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ bool is_current_thread_holding_the_gvl(void) {
//
// Thus an incorrect `is_current_thread_holding_the_gvl` result may lead to issues inside `rb_postponed_job_register_one`.
//
// For this reason we currently do not enable the new Ruby profiler on Ruby 2.5 by default, and we print a
// For this reason we default to use the "no signals workaround" on Ruby 2.5 by default, and we print a
// warning when customers force-enable it.
bool gvl_acquired = vm->gvl.acquired != 0;
rb_thread_t *current_owner = vm->running_thread;
Expand Down
34 changes: 0 additions & 34 deletions ext/datadog_profiling_native_extension/stack_recorder.c
Original file line number Diff line number Diff line change
Expand Up @@ -258,8 +258,6 @@ static VALUE _native_check_heap_hashes(DDTRACE_UNUSED VALUE _self, VALUE locatio
static VALUE _native_start_fake_slow_heap_serialization(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance);
static VALUE _native_end_fake_slow_heap_serialization(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance);
static VALUE _native_debug_heap_recorder(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance);
static VALUE _native_gc_force_recycle(DDTRACE_UNUSED VALUE _self, VALUE obj);
static VALUE _native_has_seen_id_flag(DDTRACE_UNUSED VALUE _self, VALUE obj);
static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE instance);
static VALUE build_profile_stats(profile_slot *slot, long serialization_time_ns, long heap_iteration_prep_time_ns, long heap_profile_build_time_ns);
static VALUE _native_is_object_recorded(DDTRACE_UNUSED VALUE _self, VALUE recorder_instance, VALUE object_id);
Expand Down Expand Up @@ -297,10 +295,6 @@ void stack_recorder_init(VALUE profiling_module) {
_native_end_fake_slow_heap_serialization, 1);
rb_define_singleton_method(testing_module, "_native_debug_heap_recorder",
_native_debug_heap_recorder, 1);
rb_define_singleton_method(testing_module, "_native_gc_force_recycle",
_native_gc_force_recycle, 1);
rb_define_singleton_method(testing_module, "_native_has_seen_id_flag",
_native_has_seen_id_flag, 1);
rb_define_singleton_method(testing_module, "_native_is_object_recorded?", _native_is_object_recorded, 2);
rb_define_singleton_method(testing_module, "_native_heap_recorder_reset_last_update", _native_heap_recorder_reset_last_update, 1);
rb_define_singleton_method(testing_module, "_native_recorder_after_gc_step", _native_recorder_after_gc_step, 1);
Expand Down Expand Up @@ -1006,34 +1000,6 @@ static VALUE _native_debug_heap_recorder(DDTRACE_UNUSED VALUE _self, VALUE recor
return heap_recorder_testonly_debug(state->heap_recorder);
}

#pragma GCC diagnostic push
// rb_gc_force_recycle was deprecated in latest versions of Ruby and is a noop.
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#pragma GCC diagnostic ignored "-Wunused-parameter"
// This method exists only to enable testing Datadog::Profiling::StackRecorder behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_gc_force_recycle(DDTRACE_UNUSED VALUE _self, VALUE obj) {
#ifdef HAVE_WORKING_RB_GC_FORCE_RECYCLE
rb_gc_force_recycle(obj);
#endif
return Qnil;
}
#pragma GCC diagnostic pop

// This method exists only to enable testing Datadog::Profiling::StackRecorder behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_has_seen_id_flag(DDTRACE_UNUSED VALUE _self, VALUE obj) {
#ifndef NO_SEEN_OBJ_ID_FLAG
if (RB_FL_TEST(obj, RUBY_FL_SEEN_OBJ_ID)) {
return Qtrue;
} else {
return Qfalse;
}
#else
return Qfalse;
#endif
}

static VALUE _native_stats(DDTRACE_UNUSED VALUE self, VALUE recorder_instance) {
struct stack_recorder_state *state;
TypedData_Get_Struct(recorder_instance, struct stack_recorder_state, &stack_recorder_typed_data, state);
Expand Down
20 changes: 4 additions & 16 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -207,28 +207,16 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)

return false unless heap_profiling_enabled

if RUBY_VERSION.start_with?("2.") && RUBY_VERSION < "2.7"
if RUBY_VERSION < "3.1"
Datadog.logger.warn(
"Heap profiling currently relies on features introduced in Ruby 2.7 and will be forcibly disabled. " \
"Please upgrade to Ruby >= 2.7 in order to use this feature."
"Current Ruby version (#{RUBY_VERSION}) cannot support heap profiling due to VM limitations. " \
"Please upgrade to Ruby >= 3.1 in order to use this feature. Heap profiling has been disabled."
)
return false
end

if RUBY_VERSION < "3.1"
Datadog.logger.debug(
"Current Ruby version (#{RUBY_VERSION}) supports forced object recycling which has a bug that the " \
"heap profiler is forced to work around to remain accurate. This workaround requires force-setting " \
"the SEEN_OBJ_ID flag on objects that should have it but don't. Full details can be found in " \
"https://github.com/DataDog/dd-trace-rb/pull/3360. This workaround should be safe but can be " \
"bypassed by disabling the heap profiler or upgrading to Ruby >= 3.1 where forced object recycling " \
"was completely removed (https://bugs.ruby-lang.org/issues/18290)."
)
end

unless allocation_profiling_enabled
raise ArgumentError,
"Heap profiling requires allocation profiling to be enabled"
raise ArgumentError, "Heap profiling requires allocation profiling to be enabled"
end

Datadog.logger.warn(
Expand Down
25 changes: 5 additions & 20 deletions spec/datadog/profiling/component_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@
skip_if_profiling_not_supported(self)

settings.profiling.enabled = true
# Disabled to avoid warnings on Rubies where it's not supported; there's separate specs that test it when enabled
settings.profiling.advanced.gc_enabled = false
allow(profiler_setup_task).to receive(:run)
end

Expand Down Expand Up @@ -306,15 +308,15 @@
stub_const("RUBY_VERSION", testing_version)
end

context "on a Ruby older than 2.7" do
let(:testing_version) { "2.6" }
context "on a Ruby older than 3.1" do
let(:testing_version) { "3.0" }

it "initializes StackRecorder without heap sampling support and warns" do
expect(Datadog::Profiling::StackRecorder).to receive(:new)
.with(hash_including(heap_samples_enabled: false, heap_size_enabled: false))
.and_call_original

expect(Datadog.logger).to receive(:warn).with(/upgrade to Ruby >= 2.7/)
expect(Datadog.logger).to receive(:warn).with(/upgrade to Ruby >= 3.1/)

build_profiler_component
end
Expand Down Expand Up @@ -365,23 +367,6 @@
build_profiler_component
end
end

context "on a Ruby older than 3.1" do
let(:testing_version) { "2.7" }

it "initializes StackRecorder with heap sampling support but shows warning and debug messages" do
expect(Datadog::Profiling::StackRecorder).to receive(:new)
.with(hash_including(heap_samples_enabled: true))
.and_call_original

expect(Datadog.logger).to receive(:debug).with(/Enabled allocation profiling/)
expect(Datadog.logger).to receive(:warn).with(/experimental heap profiling/)
expect(Datadog.logger).to receive(:warn).with(/experimental heap size profiling/)
expect(Datadog.logger).to receive(:debug).with(/forced object recycling.+upgrading to Ruby >= 3.1/)

build_profiler_component
end
end
end
end

Expand Down
Loading

0 comments on commit 5705775

Please sign in to comment.