Skip to content

Commit

Permalink
Merge pull request #4232 from DataDog/ivoanjo/prof-11045-fix-profiling-logging
Browse files Browse the repository at this point in the history

[PROF-11045] Fix profiling warnings being really hard to silence
  • Loading branch information
ivoanjo authored Dec 17, 2024
2 parents be1c39a + 12ca619 commit 61ca49f
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 122 deletions.
3 changes: 2 additions & 1 deletion lib/datadog/core/configuration/components.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def initialize(settings)
@profiler, profiler_logger_extra = Datadog::Profiling::Component.build_profiler_component(
settings: settings,
agent_settings: agent_settings,
optional_tracer: @tracer
optional_tracer: @tracer,
logger: @logger,
)
@environment_logger_extra.merge!(profiler_logger_extra) if profiler_logger_extra

Expand Down
108 changes: 55 additions & 53 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ module Datadog
module Profiling
# Responsible for wiring up the Profiler for execution
module Component
ALLOCATION_WITH_RACTORS_ONLY_ONCE = Datadog::Core::Utils::OnlyOnce.new
private_constant :ALLOCATION_WITH_RACTORS_ONLY_ONCE

# Passing in a `nil` tracer is supported and will disable the following profiling features:
# * Code Hotspots panel in the trace viewer, as well as scoping a profile down to a span
# * Profiling in the trace viewer, as well as scoping a profile down to a span
# * Endpoint aggregation in the profiler UX, including normalization (resource per endpoint call)
def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) # rubocop:disable Metrics/MethodLength
def self.build_profiler_component(settings:, agent_settings:, optional_tracer:, logger:) # rubocop:disable Metrics/MethodLength
return [nil, {profiling_enabled: false}] unless settings.profiling.enabled

# Workaround for weird dependency direction: the Core::Configuration::Components class currently has a
Expand Down Expand Up @@ -36,14 +39,14 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)

# NOTE: Please update the Initialization section of ProfilingDevelopment.md with any changes to this method

no_signals_workaround_enabled = no_signals_workaround_enabled?(settings)
no_signals_workaround_enabled = no_signals_workaround_enabled?(settings, logger)
timeline_enabled = settings.profiling.advanced.timeline_enabled
allocation_profiling_enabled = enable_allocation_profiling?(settings)
allocation_profiling_enabled = enable_allocation_profiling?(settings, logger)
heap_sample_every = get_heap_sample_every(settings)
heap_profiling_enabled = enable_heap_profiling?(settings, allocation_profiling_enabled, heap_sample_every)
heap_size_profiling_enabled = enable_heap_size_profiling?(settings, heap_profiling_enabled)
heap_profiling_enabled = enable_heap_profiling?(settings, allocation_profiling_enabled, heap_sample_every, logger)
heap_size_profiling_enabled = enable_heap_size_profiling?(settings, heap_profiling_enabled, logger)

overhead_target_percentage = valid_overhead_target(settings.profiling.advanced.overhead_target_percentage)
overhead_target_percentage = valid_overhead_target(settings.profiling.advanced.overhead_target_percentage, logger)
upload_period_seconds = [60, settings.profiling.advanced.upload_period_seconds].max

recorder = Datadog::Profiling::StackRecorder.new(
Expand All @@ -57,13 +60,13 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
)
thread_context_collector = build_thread_context_collector(settings, recorder, optional_tracer, timeline_enabled)
worker = Datadog::Profiling::Collectors::CpuAndWallTimeWorker.new(
gc_profiling_enabled: enable_gc_profiling?(settings),
gc_profiling_enabled: enable_gc_profiling?(settings, logger),
no_signals_workaround_enabled: no_signals_workaround_enabled,
thread_context_collector: thread_context_collector,
dynamic_sampling_rate_overhead_target_percentage: overhead_target_percentage,
allocation_profiling_enabled: allocation_profiling_enabled,
allocation_counting_enabled: settings.profiling.advanced.allocation_counting_enabled,
gvl_profiling_enabled: enable_gvl_profiling?(settings),
gvl_profiling_enabled: enable_gvl_profiling?(settings, logger),
)

internal_metadata = {
Expand Down Expand Up @@ -120,7 +123,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
)
end

private_class_method def self.enable_gc_profiling?(settings)
private_class_method def self.enable_gc_profiling?(settings, logger)
return false unless settings.profiling.advanced.gc_enabled

# SEVERE - Only with Ractors
Expand All @@ -131,14 +134,14 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
if RUBY_VERSION.start_with?("3.0.") ||
(RUBY_VERSION.start_with?("3.1.") && RUBY_VERSION < "3.1.4") ||
(RUBY_VERSION.start_with?("3.2.") && RUBY_VERSION < "3.2.3")
Datadog.logger.warn(
logger.warn(
"Current Ruby version (#{RUBY_VERSION}) has a VM bug where enabling GC profiling would cause " \
"crashes (https://bugs.ruby-lang.org/issues/18464). GC profiling has been disabled."
)
return false
elsif RUBY_VERSION.start_with?("3.")
Datadog.logger.debug(
"In all known versions of Ruby 3.x, using Ractors may result in GC profiling unexpectedly " \
logger.debug(
"Using Ractors may result in GC profiling unexpectedly " \
"stopping (https://bugs.ruby-lang.org/issues/19112). Note that this stop has no impact in your " \
"application stability or performance. This does not happen if Ractors are not used."
)
Expand All @@ -155,7 +158,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
heap_sample_rate
end

private_class_method def self.enable_allocation_profiling?(settings)
private_class_method def self.enable_allocation_profiling?(settings, logger)
return false unless settings.profiling.allocation_enabled

# Allocation sampling is safe and supported on Ruby 2.x, but has a few caveats on Ruby 3.x.
Expand All @@ -165,7 +168,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
# https://github.com/ruby/ruby/pull/7464) that makes this crash in any configuration. This bug is
# fixed on Ruby versions 3.2.3 and 3.3.0.
if RUBY_VERSION.start_with?("3.2.") && RUBY_VERSION < "3.2.3"
Datadog.logger.warn(
logger.warn(
"Allocation profiling is not supported in Ruby versions 3.2.0, 3.2.1 and 3.2.2 and will be forcibly " \
"disabled. This is due to a VM bug that can lead to crashes (https://bugs.ruby-lang.org/issues/19482). " \
"Other Ruby versions do not suffer from this issue."
Expand All @@ -181,7 +184,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
if RUBY_VERSION.start_with?("3.0.") ||
(RUBY_VERSION.start_with?("3.1.") && RUBY_VERSION < "3.1.4") ||
(RUBY_VERSION.start_with?("3.2.") && RUBY_VERSION < "3.2.3")
Datadog.logger.warn(
logger.warn(
"Current Ruby version (#{RUBY_VERSION}) has a VM bug where enabling allocation profiling while using " \
"Ractors may cause unexpected issues, including crashes (https://bugs.ruby-lang.org/issues/18464). " \
"This does not happen if Ractors are not used."
Expand All @@ -190,25 +193,27 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
# On all known versions of Ruby 3.x, due to https://bugs.ruby-lang.org/issues/19112, when a ractor gets
# garbage collected, Ruby will disable all active tracepoints, which this feature internally relies on.
elsif RUBY_VERSION.start_with?("3.")
Datadog.logger.warn(
"In all known versions of Ruby 3.x, using Ractors may result in allocation profiling unexpectedly " \
"stopping (https://bugs.ruby-lang.org/issues/19112). Note that this stop has no impact in your " \
"application stability or performance. This does not happen if Ractors are not used."
)
ALLOCATION_WITH_RACTORS_ONLY_ONCE.run do
logger.info(
"Using Ractors may result in allocation profiling " \
"stopping (https://bugs.ruby-lang.org/issues/19112). Note that this stop has no impact in your " \
"application stability or performance. This does not happen if Ractors are not used."
)
end
end

Datadog.logger.debug("Enabled allocation profiling")
logger.debug("Enabled allocation profiling")

true
end

private_class_method def self.enable_heap_profiling?(settings, allocation_profiling_enabled, heap_sample_rate)
private_class_method def self.enable_heap_profiling?(settings, allocation_profiling_enabled, heap_sample_rate, logger)
heap_profiling_enabled = settings.profiling.advanced.experimental_heap_enabled

return false unless heap_profiling_enabled

if RUBY_VERSION < "3.1"
Datadog.logger.warn(
logger.warn(
"Current Ruby version (#{RUBY_VERSION}) cannot support heap profiling due to VM limitations. " \
"Please upgrade to Ruby >= 3.1 in order to use this feature. Heap profiling has been disabled."
)
Expand All @@ -219,33 +224,31 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
raise ArgumentError, "Heap profiling requires allocation profiling to be enabled"
end

Datadog.logger.warn(
logger.warn(
"Enabled experimental heap profiling: heap_sample_rate=#{heap_sample_rate}. This is experimental, not " \
"recommended, and will increase overhead!"
)

true
end

# Decides whether heap size profiling is turned on.
#
# Reads `settings.profiling.advanced.experimental_heap_size_enabled` and returns `true` only when both that
# setting and `heap_profiling_enabled` are truthy; otherwise it returns `false` without logging anything.
# When the feature is enabled, a warning is logged first.
#
# NOTE(review): this span is rendered diff output without +/- markers. The two-argument `def` line and the
# `Datadog.logger.warn(` line look like the removed (pre-change) versions, with the three-argument `def` line
# and `logger.warn(` as their replacements -- confirm against the repository before treating this as runnable.
private_class_method def self.enable_heap_size_profiling?(settings, heap_profiling_enabled)
private_class_method def self.enable_heap_size_profiling?(settings, heap_profiling_enabled, logger)
heap_size_profiling_enabled = settings.profiling.advanced.experimental_heap_size_enabled

# Both the heap profiling flag passed in by the caller and the heap size setting must be truthy
return false unless heap_profiling_enabled && heap_size_profiling_enabled

Datadog.logger.warn(
logger.warn(
"Enabled experimental heap size profiling. This is experimental, not recommended, and will increase overhead!"
)

true
end

private_class_method def self.no_signals_workaround_enabled?(settings) # rubocop:disable Metrics/MethodLength
private_class_method def self.no_signals_workaround_enabled?(settings, logger) # rubocop:disable Metrics/MethodLength
setting_value = settings.profiling.advanced.no_signals_workaround_enabled
legacy_ruby_that_should_use_workaround = RUBY_VERSION.start_with?("2.5.")

unless [true, false, :auto].include?(setting_value)
# TODO: Replace with a warning instead.
Datadog.logger.error(
logger.warn(
"Ignoring invalid value for profiling no_signals_workaround_enabled setting: #{setting_value.inspect}. " \
"Valid options are `true`, `false` or (default) `:auto`."
)
Expand All @@ -254,23 +257,23 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
end

if setting_value == false
if legacy_ruby_that_should_use_workaround
Datadog.logger.warn(
'The profiling "no signals" workaround has been disabled via configuration on a legacy Ruby version ' \
"(< 2.6). This is not recommended " \
"in production environments, as due to limitations in Ruby APIs, we suspect it may lead to crashes " \
"in very rare situations. Please report any issues you run into to Datadog support or " \
if RUBY_VERSION.start_with?("2.5.")
logger.warn(
'The profiling "no signals" workaround has been disabled via configuration on Ruby 2.5. ' \
"This is not recommended " \
"in production environments, as due to limitations in Ruby APIs, we suspect it may lead to rare crashes " \
"Please report any issues you run into to Datadog support or " \
"via <https://github.com/datadog/dd-trace-rb/issues/new>!"
)
else
Datadog.logger.warn('Profiling "no signals" workaround disabled via configuration')
logger.warn('Profiling "no signals" workaround disabled via configuration')
end

return false
end

if setting_value == true
Datadog.logger.warn(
logger.warn(
'Profiling "no signals" workaround enabled via configuration. Profiling data will have lower quality.'
)

Expand All @@ -280,10 +283,10 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
# Setting is in auto mode. Let's probe to see if we should enable it:

# We don't warn users in this situation because "upgrade your Ruby" is not a great warning
return true if legacy_ruby_that_should_use_workaround
return true if RUBY_VERSION.start_with?("2.5.")

if Gem.loaded_specs["mysql2"] && incompatible_libmysqlclient_version?(settings)
Datadog.logger.warn(
if Gem.loaded_specs["mysql2"] && incompatible_libmysqlclient_version?(settings, logger)
logger.warn(
'Enabling the profiling "no signals" workaround because an incompatible version of the mysql2 gem is ' \
"installed. Profiling data will have lower quality. " \
"To fix this, upgrade the libmysqlclient in your OS image to version 8.0.0 or above."
Expand All @@ -292,7 +295,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
end

if Gem.loaded_specs["rugged"]
Datadog.logger.warn(
logger.warn(
'Enabling the profiling "no signals" workaround because the rugged gem is installed. ' \
"This is needed because some operations on this gem are currently incompatible with the normal working mode " \
"of the profiler, as detailed in <https://github.com/datadog/dd-trace-rb/issues/2721>. " \
Expand All @@ -302,7 +305,7 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
end

if (defined?(::PhusionPassenger) || Gem.loaded_specs["passenger"]) && incompatible_passenger_version?
Datadog.logger.warn(
logger.warn(
'Enabling the profiling "no signals" workaround because an incompatible version of the passenger gem is ' \
"installed. Profiling data will have lower quality." \
"To fix this, upgrade the passenger gem to version 6.0.19 or above."
Expand All @@ -322,10 +325,10 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
#
# The `mysql2` gem's `info` method can be used to determine which `libmysqlclient` version is in use, and thus to
# detect if it's safe for the profiler to use signals or if we need to employ a fallback.
private_class_method def self.incompatible_libmysqlclient_version?(settings)
private_class_method def self.incompatible_libmysqlclient_version?(settings, logger)
return true if settings.profiling.advanced.skip_mysql2_check

Datadog.logger.debug(
logger.debug(
"Requiring `mysql2` to check if the `libmysqlclient` version it uses is compatible with profiling"
)

Expand Down Expand Up @@ -354,14 +357,14 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
libmysqlclient_version >= Gem::Version.new("8.0.0") ||
looks_like_mariadb?(info, libmysqlclient_version)

Datadog.logger.debug(
logger.debug(
"The `mysql2` gem is using #{compatible ? "a compatible" : "an incompatible"} version of " \
"the `libmysqlclient` library (#{libmysqlclient_version})"
)

!compatible
rescue StandardError, LoadError => e
Datadog.logger.warn(
logger.warn(
"Failed to probe `mysql2` gem information. " \
"Cause: #{e.class.name} #{e.message} Location: #{Array(e.backtrace).first}"
)
Expand All @@ -383,12 +386,11 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
end
end

private_class_method def self.valid_overhead_target(overhead_target_percentage)
private_class_method def self.valid_overhead_target(overhead_target_percentage, logger)
if overhead_target_percentage > 0 && overhead_target_percentage <= 20
overhead_target_percentage
else
# TODO: Replace with a warning instead.
Datadog.logger.error(
logger.warn(
"Ignoring invalid value for profiling overhead_target_percentage setting: " \
"#{overhead_target_percentage.inspect}. Falling back to default value."
)
Expand Down Expand Up @@ -432,10 +434,10 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
settings.profiling.advanced.dir_interruption_workaround_enabled
end

private_class_method def self.enable_gvl_profiling?(settings)
private_class_method def self.enable_gvl_profiling?(settings, logger)
if RUBY_VERSION < "3.2"
if settings.profiling.advanced.preview_gvl_enabled
Datadog.logger.warn("GVL profiling is currently not supported in Ruby < 3.2 and will not be enabled.")
logger.warn("GVL profiling is currently not supported in Ruby < 3.2 and will not be enabled.")
end

return false
Expand Down
Loading

0 comments on commit 61ca49f

Please sign in to comment.