Skip to content

Commit

Permalink
Opt-in config and logic for deferral of instrumentation to only WSGI …
Browse files Browse the repository at this point in the history
…worker processes (#243)

### Issue:
OTel Python [has
issues](open-telemetry/opentelemetry-python#2767)
where the SDK is unable to report metrics for applications using a fork
process model WSGI server.
This affects ADOT when it tries to generate the OTel or Application
Signals metrics.

A solution to this is to [re-initialize the SDK in the worker processes
after the process forking has
happened](https://opentelemetry-python.readthedocs.io/en/latest/examples/fork-process-model/README.html).
A small caveat is that if the SDK has been initialized in the master
process, the worker process SDK won't work because Tracer/Meter
providers can be set globally only once. So to circumvent this, we need
to skip initializing the SDK in the master process and only do so in the
worker processes.

### Description of changes:
- Introducing an opt-in configuration environment variable
`OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED` that users can enable if they are
using a WSGI (or another fork process model) server and want the ADOT SDK to
defer auto-instrumentation to worker processes.
- Whenever the ADOT SDK auto-instrumentation is loaded (either via the
`sitecustomize.py` file or the `opentelemetry-instrument` command), the
SDK will check if the above configuration is enabled and if the current
process is the master process, and will skip the instrumentation.
- The way we determine if the current process is master or worker is by
using an internal marker environment variable
`IS_WSGI_MASTER_PROCESS_ALREADY_SEEN`. The first time the ADOT SDK sees
a python process, this env var is not set and it will know this should
be a WSGI master process. We then set the env var and when a new worker
process forks, the master environment is copied to it (and so the env
var). So when the ADOT SDK checks this env var again (in worker) it
finds that the env var was already set to `true` in the master.

### Testing:
- Unit tests covering the functionality based on different
configurations of the `OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED` and
`IS_WSGI_MASTER_PROCESS_ALREADY_SEEN` variables.
- Manual test using a sample application. Since this is an opt-in
configuration (a 2-way door), testing manually gives us a fair bit of
confidence.


By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.
  • Loading branch information
srprash authored Aug 30, 2024
1 parent 1c675d3 commit 90f7fa0
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
# Default metric export interval.
# NOTE(review): presumably milliseconds (i.e. 60 seconds) — confirm against the code that consumes it.
DEFAULT_METRIC_EXPORT_INTERVAL = 60000.0
# Configuration key names. NOTE(review): presumably environment variable names — confirm at the call sites.
AWS_LAMBDA_FUNCTION_NAME_CONFIG = "AWS_LAMBDA_FUNCTION_NAME"
AWS_XRAY_DAEMON_ADDRESS_CONFIG = "AWS_XRAY_DAEMON_ADDRESS"
# Name of the opt-in environment variable read by `_is_defer_to_workers_enabled()`. Deferral counts as
# enabled only when the value, after stripping whitespace and lower-casing, equals "true".
OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED_CONFIG = "OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED"

# Module-level logger, named after this module.
_logger: Logger = getLogger(__name__)

Expand All @@ -85,6 +86,11 @@ class AwsOpenTelemetryConfigurator(_OTelSDKConfigurator):
# pylint: disable=no-self-use
@override
def _configure(self, **kwargs):
    """Initialize the ADOT components unless this process must defer to its workers.

    Initialization is skipped (and an info message is logged) only when deferral to workers is enabled
    *and* the current process is identified as the WSGI master process. In every other case
    ``_initialize_components()`` is called. ``kwargs`` is accepted but not used here.
    """
    # `and` short-circuits on purpose: `_is_wsgi_master_process()` mutates the environment, so it
    # must only run when deferral is actually enabled.
    skip_in_this_process = _is_defer_to_workers_enabled() and _is_wsgi_master_process()
    if not skip_in_this_process:
        _initialize_components()
        return
    _logger.info("Skipping ADOT initialization since deferral to worker is enabled, and this is a master process.")


Expand Down Expand Up @@ -156,6 +162,27 @@ def _init_tracing(
# END The OpenTelemetry Authors code


def _is_defer_to_workers_enabled():
    """Return True when deferral of instrumentation to worker processes has been opted into.

    Reads the environment variable named by ``OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED_CONFIG``. A missing
    variable is treated as "false". The value is compared to "true" after stripping surrounding
    whitespace and lower-casing, so "True" and " TRUE " both enable the feature.
    """
    raw_setting = os.environ.get(OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED_CONFIG, "false")
    normalized_setting = raw_setting.strip().lower()
    return normalized_setting == "true"


def _is_wsgi_master_process():
    """Return True if the current process is taken to be a WSGI server's master process.

    Auto-instrumentation loads in every process that gets created, and because of known issues with
    instrumenting WSGI apps using OTel the master process should not be instrumented. A fork-model WSGI
    server typically runs one master process and several worker processes. The master is recognised by
    the absence of a marker environment variable, which this function then sets; worker processes
    inherit the master's environment and therefore see the marker already present.

    Note: this function has a side effect. Calling it a second time in the same master process returns
    False (i.e. reports a worker), so call it at most once per process.
    """
    marker_name = "IS_WSGI_MASTER_PROCESS_ALREADY_SEEN"
    marker_already_set = os.environ.get(marker_name, "false").lower() == "true"
    if not marker_already_set:
        # First sighting: leave the marker behind so that forked children identify themselves as workers.
        os.environ[marker_name] = "true"
        _logger.info("pid %s identified as a master process", str(os.getpid()))
        return True
    _logger.info("pid %s identified as a worker process", str(os.getpid()))
    return False


def _exclude_urls_for_instrumentations():
urls_to_exclude_instr = "SamplingTargets,GetSamplingRules"
requests_excluded_urls = os.environ.pop("OTEL_PYTHON_REQUESTS_EXCLUDED_URLS", "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
_customize_sampler,
_customize_span_processors,
_is_application_signals_enabled,
_is_defer_to_workers_enabled,
_is_wsgi_master_process,
)
from amazon.opentelemetry.distro.aws_opentelemetry_distro import AwsOpenTelemetryDistro
from amazon.opentelemetry.distro.aws_span_metrics_processor import AwsSpanMetricsProcessor
Expand Down Expand Up @@ -305,6 +307,50 @@ def test_application_signals_exporter_provider(self):
self.assertEqual("127.0.0.1:2000", exporter._udp_exporter._endpoint)
os.environ.pop("AWS_LAMBDA_FUNCTION_NAME", None)

def test_is_defer_to_workers_enabled(self):
    """Check that the opt-in flag is parsed case-insensitively and defaults to disabled.

    Uses ``patch.dict`` rather than ``os.environ.setdefault``/``pop``: ``setdefault`` silently keeps a
    value already present in the environment, and manual cleanup is skipped when an assertion fails.
    ``patch.dict`` forces the value and always restores the previous environment on exit.
    """
    flag_name = "OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED"

    with patch.dict(os.environ, {flag_name: "True"}):
        self.assertTrue(_is_defer_to_workers_enabled())

    with patch.dict(os.environ, {flag_name: "False"}):
        self.assertFalse(_is_defer_to_workers_enabled())

    # Unset case: remove the variable inside the patched scope so the original value is restored afterwards.
    with patch.dict(os.environ):
        os.environ.pop(flag_name, None)
        self.assertFalse(_is_defer_to_workers_enabled())

def test_is_wsgi_master_process_first_time(self):
    """Check that the first call reports a master process and leaves the marker set to "true".

    The marker is explicitly removed before the call so the test does not depend on state left behind
    by other tests or by the invoking environment. ``patch.dict`` restores the environment afterwards
    even if an assertion fails.
    """
    marker_name = "IS_WSGI_MASTER_PROCESS_ALREADY_SEEN"
    with patch.dict(os.environ):
        os.environ.pop(marker_name, None)
        self.assertTrue(_is_wsgi_master_process())
        self.assertEqual(os.environ[marker_name], "true")

@patch("amazon.opentelemetry.distro.aws_opentelemetry_configurator._initialize_components")
def test_initialize_components_skipped_in_master_when_deferred_enabled(self, mock_initialize_components):
    """With deferral enabled and no marker present, ``_configure`` must not initialize components.

    ``patch.dict`` forces the flag to "True" (``setdefault`` would keep any pre-existing value) and
    restores the environment on exit, including removal of the marker that ``_configure`` sets.
    """
    with patch.dict(os.environ, {"OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED": "True"}):
        # No marker means this process is seen for the first time, i.e. treated as the master.
        os.environ.pop("IS_WSGI_MASTER_PROCESS_ALREADY_SEEN", None)
        self.assertTrue(_is_defer_to_workers_enabled())
        AwsOpenTelemetryConfigurator()._configure()
        mock_initialize_components.assert_not_called()

@patch("amazon.opentelemetry.distro.aws_opentelemetry_configurator._initialize_components")
def test_initialize_components_called_in_worker_when_deferred_enabled(self, mock_initialize_components):
    """With deferral enabled and the marker already set, ``_configure`` must initialize components.

    Both variables are forced through ``patch.dict`` so pre-existing environment values cannot change
    the scenario, and the environment is restored even when an assertion fails.
    """
    worker_environment = {
        "OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED": "True",
        "IS_WSGI_MASTER_PROCESS_ALREADY_SEEN": "true",
    }
    with patch.dict(os.environ, worker_environment):
        self.assertTrue(_is_defer_to_workers_enabled())
        # Safe to call directly here: with the marker pre-set the check does not modify the environment.
        self.assertFalse(_is_wsgi_master_process())
        AwsOpenTelemetryConfigurator()._configure()
        mock_initialize_components.assert_called_once()

@patch("amazon.opentelemetry.distro.aws_opentelemetry_configurator._initialize_components")
def test_initialize_components_called_when_deferred_disabled(self, mock_initialize_components):
    """With the opt-in flag unset, ``_configure`` must initialize components.

    The flag is removed inside a ``patch.dict`` scope so that the caller's environment is restored
    afterwards, whether or not the assertions pass.
    """
    with patch.dict(os.environ):
        os.environ.pop("OTEL_AWS_PYTHON_DEFER_TO_WORKERS_ENABLED", None)
        self.assertFalse(_is_defer_to_workers_enabled())
        AwsOpenTelemetryConfigurator()._configure()
        mock_initialize_components.assert_called_once()


def validate_distro_environ():
tc: TestCase = TestCase()
Expand Down

0 comments on commit 90f7fa0

Please sign in to comment.