Do not emit requests related prometheus metrics from the main process
We have been affected by deadlocks caused by talisker instrumenting the
gunicorn main process's logging with sentry and the related prometheus
metrics. The deadlocks were on a lock in prometheus_client.

So this is a single-threaded context: the main process was sometimes in
the middle of incrementing a prometheus counter as a result of sending an
event to sentry, which in turn was triggered by the gunicorn main process
logging an error (e.g. about terminating an unresponsive worker). While
the global multiprocessing-mode lock in values.py was held, a signal
handler for SIGCHLD was invoked in that same main process in response to
some other worker terminating. During that signal handling the main
process also logged an error message, which caused another sentry event
and a corresponding prometheus counter update that tried to take the
already-held lock -> deadlock.
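
For illustration, a minimal sketch of that kind of re-entrancy deadlock,
assuming a POSIX system and using a plain threading.Lock as a stand-in for
the internal prometheus_client lock (the handler acquires with a timeout so
the sketch terminates instead of hanging):

import os
import signal
import threading
import time

# Stand-in for the non-reentrant lock in prometheus_client's values.py.
lock = threading.Lock()

def sigchld_handler(signum, frame):
    # Signal handlers run in the main thread, interrupting whatever it was
    # doing. If the interrupted code already holds `lock`, a plain acquire()
    # here would block forever: the main thread can never resume to release it.
    if not lock.acquire(timeout=2):
        print('would deadlock: lock is held by the code this handler interrupted')
    else:
        lock.release()

signal.signal(signal.SIGCHLD, sigchld_handler)

with lock:              # e.g. in the middle of incrementing a counter
    if os.fork() == 0:  # a child exits, delivering SIGCHLD to the parent
        os._exit(0)
    time.sleep(3)       # the handler runs here, inside the critical section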

So, in order to be more careful/conservative about what we do during
signal handling, or in this case about what we instrument gunicorn to do,
this change sets up all of the requests-related metrics so that they are
not emitted from the process they were created in (which, for these
metrics, is the gunicorn arbiter when running under gunicorn).

More details on prometheus client behavior in
prometheus/client_python#1076
suligap committed Dec 4, 2024
1 parent 91949e2 commit ba30a06
Showing 3 changed files with 65 additions and 13 deletions.
35 changes: 26 additions & 9 deletions talisker/metrics.py
@@ -25,6 +25,7 @@
 from contextlib import contextmanager
 import functools
 import logging
+import os
 import time
 
 import talisker.statsd
@@ -47,17 +48,31 @@
 class Metric():
     """Abstraction over prometheus and statsd metrics."""
 
-    def __init__(self, name, *args, **kwargs):
+    def __init__(self, name, *args, _only_workers_prometheus=False, **kwargs):
         self.name = name
         self.statsd_template = None
+        # If true, do not emit prometheus metrics in the gunicorn arbiter.
+        # They're not particularly useful there and cause deadlocks when it
+        # handles signals (signal handler being invoked when prometheus client
+        # is holding an internal lock in response to emitting different metric)
+        # and during signal handling logs an error, causing an http request to
+        # sentry to be tracked in prometheus.
+        self._only_workers_prometheus = _only_workers_prometheus
+        self._origin_pid = os.getpid()
 
         if statsd:
             self.statsd_template = kwargs.pop('statsd', None)
 
         if prometheus_client:
-            self.prometheus = self.get_type()(name, *args, **kwargs)
+            self._prometheus_metric = self.get_type()(name, *args, **kwargs)
         else:
-            self.prometheus = None
+            self._prometheus_metric = None
+
+    @property
+    def prometheus(self):
+        if self._only_workers_prometheus and os.getpid() == self._origin_pid:
+            return None
+        return self._prometheus_metric
 
     # prometheus_client does some odd things, if we define this as a class
     # variable it doesn't work
@@ -100,11 +115,12 @@ def metric_type(self):
 
     @protect("Failed to collect histogram metric")
     def observe(self, amount, **labels):
-        if self.prometheus:
+        prom_metric = self.prometheus
+        if prom_metric:
             if labels:
-                self.prometheus.labels(**labels).observe(amount)
+                prom_metric.labels(**labels).observe(amount)
             else:
-                self.prometheus.observe(amount)
+                prom_metric.observe(amount)
 
         if self.statsd_template:
             client = talisker.statsd.get_client()
@@ -128,11 +144,12 @@ def metric_type(self):
 
     @protect("Failed to increment counter metric")
     def inc(self, amount=1, **labels):
-        if self.prometheus:
+        prom_metric = self.prometheus
+        if prom_metric:
             if labels:
-                self.prometheus.labels(**labels).inc(amount)
+                prom_metric.labels(**labels).inc(amount)
             else:
-                self.prometheus.inc(amount)
+                prom_metric.inc(amount)
 
         if self.statsd_template:
             client = talisker.statsd.get_client()
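A rough usage sketch of the new gating (hypothetical metric name and call
sites, not part of the diff; assumes prometheus_client is installed and the
metric is created before workers fork, as it is for the gunicorn arbiter):

import os
import talisker.metrics

# Created in the parent process, so _origin_pid is the parent's (arbiter's) pid.
counter = talisker.metrics.Counter(
    _only_workers_prometheus=True,
    name='example_count',            # hypothetical metric
    documentation='example counter',
    labelnames=['host'],
)

counter.inc(host='example.com')      # parent: .prometheus is None, nothing recorded

if os.fork() == 0:                   # a "worker": os.getpid() != _origin_pid
    counter.inc(host='example.com')  # here the prometheus counter is incremented
    os._exit(0)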
3 changes: 3 additions & 0 deletions talisker/requests.py
@@ -80,6 +80,7 @@ def clear():
 
 class RequestsMetric:
     latency = talisker.metrics.Histogram(
+        _only_workers_prometheus=True,
         name='requests_latency',
         documentation='Duration of http calls via requests library',
         labelnames=['host', 'view', 'status'],
@@ -89,13 +90,15 @@ class RequestsMetric:
     )
 
     count = talisker.metrics.Counter(
+        _only_workers_prometheus=True,
         name='requests_count',
         documentation='Count of http calls via requests library',
         labelnames=['host', 'view'],
         statsd='{name}.{host}.{view}',
     )
 
     errors = talisker.metrics.Counter(
+        _only_workers_prometheus=True,
         name='requests_errors',
         documentation='Count of errors in responses via requests library',
         labelnames=['host', 'type', 'view', 'status'],
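The effect on these metrics, roughly (hypothetical label values; the statsd
side is not gated by pid, so it still fires wherever statsd is configured):

from talisker.requests import RequestsMetric

# In the gunicorn arbiter (the process where these class attributes were
# created), the prometheus counter is now skipped:
RequestsMetric.count.inc(host='api.example.com', view='frontpage')

# In a worker (different pid), the same call increments the requests_count
# prometheus counter exactly as before.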
40 changes: 36 additions & 4 deletions tests/test_metrics.py
@@ -22,6 +22,8 @@
 # under the License.
 #
 
+import os
+
 import pytest
 
 try:
@@ -83,7 +85,7 @@ def test_histogram(context, registry):
     assert after_bucket - bucket == 1.0
 
 
-def test_histogram_protected(context, registry):
+def test_histogram_protected(monkeypatch, context, registry):
     histogram = metrics.Histogram(
         name='test_histogram_protected',
         documentation='test histogram',
@@ -92,7 +94,7 @@ def test_histogram_protected(context, registry):
         registry=registry,
     )
 
-    histogram.prometheus = 'THIS WILL RAISE'
+    monkeypatch.setattr(histogram, '_prometheus_metric', 'THIS WILL RAISE')
     histogram.observe(1.0, label='label')
     context.assert_log(msg='Failed to collect histogram metric')
 
@@ -115,14 +117,44 @@ def test_counter(context, registry):
     assert metric - count == 2
 
 
-def test_counter_protected(context, registry):
+def test_counter_protected(monkeypatch, context, registry):
     counter = metrics.Counter(
         name='test_counter_protected',
         documentation='test counter',
         labelnames=['label'],
         statsd='{name}.{label}',
     )
 
-    counter.prometheus = 'THIS WILL RAISE'
+    monkeypatch.setattr(counter, '_prometheus_metric', 'THIS WILL RAISE')
     counter.inc(1, label='label')
     context.assert_log(msg='Failed to increment counter metric')
+
+
+def test_only_workers_prometheus(monkeypatch, registry):
+    counter = metrics.Counter(
+        _only_workers_prometheus=True,
+        name='test_counter_onlyworker',
+        documentation='test counter',
+        labelnames=['label'],
+        statsd='{name}.{label}',
+        registry=registry,
+    )
+
+    labels = {'label': 'value'}
+    count = registry.get_metric('test_counter_onlyworker', **labels)
+    counter.inc(3, **labels)
+
+    metric = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    assert metric - count == 0
+
+    fake_worker_pid = os.getpid() + 1
+    monkeypatch.setattr(os, 'getpid', lambda: fake_worker_pid)
+
+    counter.inc(4, **labels)
+
+    metric = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    assert metric - count == 4