From ba30a06c10d5bf06b7f82c709398da66d76a238d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Przemys=C5=82aw=20Suliga?=
Date: Wed, 4 Dec 2024 11:29:23 +0100
Subject: [PATCH] Do not emit requests-related prometheus metrics from the
 main process

We have been affected by deadlocks caused by talisker instrumenting
the gunicorn main process's logging with sentry and the related
prometheus metrics. The deadlocks were on a lock in prometheus_client.

The whole sequence is single threaded. The main process logs an error,
e.g. about terminating an unresponsive worker; the sentry integration
turns that log into an event, and sending the event increments a
prometheus counter. While that increment held the global
multiprocessing mode lock in values.py, a SIGCHLD signal handler was
invoked in the same process because some other worker terminated.
During that signal handling the main process logged another error,
which produced another sentry event and a corresponding prometheus
counter update on the already held, non-reentrant lock -> deadlock.

To be more conservative about what we do during signal handling (in
this case, about what we instrument gunicorn to do), this change makes
all of the requests-related metrics skip prometheus emission in the
process that created them, which for these metrics is the gunicorn
arbiter when running under gunicorn.

More details on the prometheus client behavior are in
https://github.com/prometheus/client_python/pull/1076
---
 talisker/metrics.py   | 35 ++++++++++++++++++++++++++---------
 talisker/requests.py  |  3 +++
 tests/test_metrics.py | 42 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/talisker/metrics.py b/talisker/metrics.py
index e3ea1d0b..26ef0428 100644
--- a/talisker/metrics.py
+++ b/talisker/metrics.py
@@ -25,6 +25,7 @@
 from contextlib import contextmanager
 import functools
 import logging
+import os
 import time
 
 import talisker.statsd
@@ -47,17 +48,31 @@
 class Metric():
     """Abstraction over prometheus and statsd metrics."""
 
-    def __init__(self, name, *args, **kwargs):
+    def __init__(self, name, *args, _only_workers_prometheus=False, **kwargs):
         self.name = name
         self.statsd_template = None
+        # If true, do not emit prometheus metrics in the gunicorn arbiter.
+        # They're not particularly useful there and they cause deadlocks when
+        # it handles signals (the signal handler being invoked while the
+        # prometheus client holds an internal lock for a different metric)
+        # and during signal handling logs an error, causing an http request
+        # to sentry to be tracked in prometheus.
+        self._only_workers_prometheus = _only_workers_prometheus
+        self._origin_pid = os.getpid()
 
         if statsd:
             self.statsd_template = kwargs.pop('statsd', None)
 
         if prometheus_client:
-            self.prometheus = self.get_type()(name, *args, **kwargs)
+            self._prometheus_metric = self.get_type()(name, *args, **kwargs)
         else:
-            self.prometheus = None
+            self._prometheus_metric = None
+
+    @property
+    def prometheus(self):
+        if self._only_workers_prometheus and os.getpid() == self._origin_pid:
+            return None
+        return self._prometheus_metric
 
     # prometheus_client does some odd things, if we define this as a class
     # variable it doesn't work
@@ -100,11 +115,12 @@ def metric_type(self):
 
     @protect("Failed to collect histogram metric")
     def observe(self, amount, **labels):
-        if self.prometheus:
+        prom_metric = self.prometheus
+        if prom_metric:
             if labels:
-                self.prometheus.labels(**labels).observe(amount)
+                prom_metric.labels(**labels).observe(amount)
             else:
-                self.prometheus.observe(amount)
+                prom_metric.observe(amount)
 
         if self.statsd_template:
             client = talisker.statsd.get_client()
@@ -128,11 +144,12 @@ def metric_type(self):
 
     @protect("Failed to increment counter metric")
     def inc(self, amount=1, **labels):
-        if self.prometheus:
+        prom_metric = self.prometheus
+        if prom_metric:
             if labels:
-                self.prometheus.labels(**labels).inc(amount)
+                prom_metric.labels(**labels).inc(amount)
             else:
-                self.prometheus.inc(amount)
+                prom_metric.inc(amount)
 
         if self.statsd_template:
             client = talisker.statsd.get_client()
diff --git a/talisker/requests.py b/talisker/requests.py
index 5b4901fc..f1f48e49 100644
--- a/talisker/requests.py
+++ b/talisker/requests.py
@@ -80,6 +80,7 @@ def clear():
 
 class RequestsMetric:
     latency = talisker.metrics.Histogram(
+        _only_workers_prometheus=True,
         name='requests_latency',
         documentation='Duration of http calls via requests library',
         labelnames=['host', 'view', 'status'],
@@ -89,6 +90,7 @@ class RequestsMetric:
     )
 
     count = talisker.metrics.Counter(
+        _only_workers_prometheus=True,
         name='requests_count',
         documentation='Count of http calls via requests library',
         labelnames=['host', 'view'],
@@ -96,6 +98,7 @@ class RequestsMetric:
     )
 
     errors = talisker.metrics.Counter(
+        _only_workers_prometheus=True,
         name='requests_errors',
         documentation='Count of errors in responses via requests library',
         labelnames=['host', 'type', 'view', 'status'],
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index c4d3dfe3..1960bea7 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -22,6 +22,8 @@
 # under the License.
 #
 
+import os
+
 import pytest
 
 try:
@@ -83,7 +85,7 @@ def test_histogram(context, registry):
     assert after_bucket - bucket == 1.0
 
 
-def test_histogram_protected(context, registry):
+def test_histogram_protected(monkeypatch, context, registry):
     histogram = metrics.Histogram(
         name='test_histogram_protected',
         documentation='test histogram',
@@ -92,7 +94,7 @@ def test_histogram_protected(context, registry):
         registry=registry,
     )
 
-    histogram.prometheus = 'THIS WILL RAISE'
+    monkeypatch.setattr(histogram, '_prometheus_metric', 'THIS WILL RAISE')
     histogram.observe(1.0, label='label')
     context.assert_log(msg='Failed to collect histogram metric')
 
@@ -115,7 +117,7 @@ def test_counter(context, registry):
     assert metric - count == 2
 
 
-def test_counter_protected(context, registry):
+def test_counter_protected(monkeypatch, context, registry):
     counter = metrics.Counter(
         name='test_counter_protected',
         documentation='test counter',
@@ -123,6 +125,38 @@ def test_counter_protected(context, registry):
         statsd='{name}.{label}',
     )
 
-    counter.prometheus = 'THIS WILL RAISE'
+    monkeypatch.setattr(counter, '_prometheus_metric', 'THIS WILL RAISE')
     counter.inc(1, label='label')
     context.assert_log(msg='Failed to increment counter metric')
+
+
+def test_only_workers_prometheus(monkeypatch, registry):
+    counter = metrics.Counter(
+        _only_workers_prometheus=True,
+        name='test_counter_onlyworker',
+        documentation='test counter',
+        labelnames=['label'],
+        statsd='{name}.{label}',
+        registry=registry,
+    )
+
+    labels = {'label': 'value'}
+    count = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    counter.inc(3, **labels)
+
+    metric = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    assert metric - count == 0
+
+    fake_worker_pid = os.getpid() + 1
+    monkeypatch.setattr(os, 'getpid', lambda: fake_worker_pid)
+
+    counter.inc(4, **labels)
+
+    metric = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    assert metric - count == 4
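
For reference, a minimal standalone sketch (not part of the patch) of the
deadlock mechanism described in the commit message. All names are
illustrative: SIGALRM stands in for SIGCHLD, a plain threading.Lock stands
in for the prometheus_client internal lock, and a timeout is used so the
script recovers instead of hanging. Assumes a Unix-like system:

    # deadlock_sketch.py - a signal handler re-entering a critical
    # section on the same thread, as in the arbiter deadlock above.
    import signal
    import threading
    import time

    # Stand-in for the non-reentrant lock prometheus_client holds while
    # updating a metric value in multiprocess mode.
    values_lock = threading.Lock()

    def increment_metric(who):
        # Critical section: the lock is held while the value is "updated".
        if not values_lock.acquire(timeout=5):
            # With a real blocking acquire() this would hang forever.
            print('%s: lock already held by this same thread -> deadlock' % who)
            return
        try:
            time.sleep(2)  # window during which a signal can arrive
        finally:
            values_lock.release()

    def on_signal(signum, frame):
        # The arbiter's handler logs an error; sentry instrumentation
        # turns that into another metric increment on the same lock.
        increment_metric('signal handler')

    signal.signal(signal.SIGALRM, on_signal)
    signal.alarm(1)  # deliver the signal while the main flow holds the lock
    increment_metric('main flow')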
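
And a small sketch of how the new _only_workers_prometheus flag behaves
across a fork, mirroring the gunicorn arbiter/worker split. The metric
name is made up, and prometheus_client is assumed to be installed so the
underlying metric object exists:

    # flag_sketch.py - the prometheus side of a flagged metric is
    # suppressed in the creating process but active in forked children.
    import os
    import talisker.metrics

    counter = talisker.metrics.Counter(
        _only_workers_prometheus=True,
        name='demo_requests_count',
        documentation='demo counter',
        labelnames=['host'],
    )

    # In the creating process (the arbiter, under gunicorn) the property
    # returns None, so inc() never touches prometheus_client locks here.
    assert counter.prometheus is None

    if os.fork() == 0:
        # In the child the pid no longer matches _origin_pid, so the
        # underlying prometheus metric is exposed and updated as usual.
        assert counter.prometheus is not None
        counter.inc(1, host='example.com')
        os._exit(0)
    else:
        os.wait()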