From ba30a06c10d5bf06b7f82c709398da66d76a238d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Przemys=C5=82aw=20Suliga?=
Date: Wed, 4 Dec 2024 11:29:23 +0100
Subject: [PATCH] Do not emit requests-related prometheus metrics from the
 main process

We have been affected by deadlocks caused by talisker instrumenting
the gunicorn main process's logging with sentry and the related
prometheus metrics. The deadlocks were on a lock in prometheus_client.

The whole sequence is single threaded. The main process logs an error,
e.g. about terminating an unresponsive worker; the sentry integration
turns that log into an event, and sending the event increments a
prometheus counter. While that increment held the global
multiprocessing mode lock in values.py, a SIGCHLD signal handler was
invoked in the same process because some other worker terminated.
During that signal handling the main process logged another error,
which produced another sentry event and a corresponding prometheus
counter update on the already held, non-reentrant lock -> deadlock.

To be more conservative about what we do during signal handling (in
this case, about what we instrument gunicorn to do), this change makes
all of the requests-related metrics skip prometheus emission in the
process that created them, which for these metrics is the gunicorn
arbiter when running under gunicorn.

More details on the prometheus client behavior are in
https://github.com/prometheus/client_python/pull/1076
---
 talisker/metrics.py   | 35 ++++++++++++++++++++++++++---------
 talisker/requests.py  |  3 +++
 tests/test_metrics.py | 42 ++++++++++++++++++++++++++++++++++++++----
 3 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/talisker/metrics.py b/talisker/metrics.py
index e3ea1d0b..26ef0428 100644
--- a/talisker/metrics.py
+++ b/talisker/metrics.py
@@ -25,6 +25,7 @@
 from contextlib import contextmanager
 import functools
 import logging
+import os
 import time
 
 import talisker.statsd
@@ -47,17 +48,31 @@
 class Metric():
     """Abstraction over prometheus and statsd metrics."""
 
-    def __init__(self, name, *args, **kwargs):
+    def __init__(self, name, *args, _only_workers_prometheus=False, **kwargs):
         self.name = name
         self.statsd_template = None
+        # If true, do not emit prometheus metrics in the gunicorn arbiter.
+        # They're not particularly useful there and they cause deadlocks when
+        # it handles signals (the signal handler being invoked while the
+        # prometheus client holds an internal lock for a different metric)
+        # and during signal handling logs an error, causing an http request
+        # to sentry to be tracked in prometheus.
+        self._only_workers_prometheus = _only_workers_prometheus
+        self._origin_pid = os.getpid()
 
         if statsd:
             self.statsd_template = kwargs.pop('statsd', None)
 
         if prometheus_client:
-            self.prometheus = self.get_type()(name, *args, **kwargs)
+            self._prometheus_metric = self.get_type()(name, *args, **kwargs)
         else:
-            self.prometheus = None
+            self._prometheus_metric = None
+
+    @property
+    def prometheus(self):
+        if self._only_workers_prometheus and os.getpid() == self._origin_pid:
+            return None
+        return self._prometheus_metric
 
     # prometheus_client does some odd things, if we define this as a class
     # variable it doesn't work
@@ -100,11 +115,12 @@ def metric_type(self):
 
     @protect("Failed to collect histogram metric")
     def observe(self, amount, **labels):
-        if self.prometheus:
+        prom_metric = self.prometheus
+        if prom_metric:
             if labels:
-                self.prometheus.labels(**labels).observe(amount)
+                prom_metric.labels(**labels).observe(amount)
             else:
-                self.prometheus.observe(amount)
+                prom_metric.observe(amount)
 
         if self.statsd_template:
             client = talisker.statsd.get_client()
@@ -128,11 +144,12 @@ def metric_type(self):
 
     @protect("Failed to increment counter metric")
     def inc(self, amount=1, **labels):
-        if self.prometheus:
+        prom_metric = self.prometheus
+        if prom_metric:
             if labels:
-                self.prometheus.labels(**labels).inc(amount)
+                prom_metric.labels(**labels).inc(amount)
             else:
-                self.prometheus.inc(amount)
+                prom_metric.inc(amount)
 
         if self.statsd_template:
             client = talisker.statsd.get_client()
diff --git a/talisker/requests.py b/talisker/requests.py
index 5b4901fc..f1f48e49 100644
--- a/talisker/requests.py
+++ b/talisker/requests.py
@@ -80,6 +80,7 @@ def clear():
 
 class RequestsMetric:
     latency = talisker.metrics.Histogram(
+        _only_workers_prometheus=True,
         name='requests_latency',
         documentation='Duration of http calls via requests library',
         labelnames=['host', 'view', 'status'],
@@ -89,6 +90,7 @@ class RequestsMetric:
     )
 
     count = talisker.metrics.Counter(
+        _only_workers_prometheus=True,
         name='requests_count',
         documentation='Count of http calls via requests library',
         labelnames=['host', 'view'],
@@ -96,6 +98,7 @@ class RequestsMetric:
     )
 
     errors = talisker.metrics.Counter(
+        _only_workers_prometheus=True,
         name='requests_errors',
         documentation='Count of errors in responses via requests library',
         labelnames=['host', 'type', 'view', 'status'],
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index c4d3dfe3..1960bea7 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -22,6 +22,8 @@
 # under the License.
 #
 
+import os
+
 import pytest
 
 try:
@@ -83,7 +85,7 @@ def test_histogram(context, registry):
     assert after_bucket - bucket == 1.0
 
 
-def test_histogram_protected(context, registry):
+def test_histogram_protected(monkeypatch, context, registry):
     histogram = metrics.Histogram(
         name='test_histogram_protected',
         documentation='test histogram',
@@ -92,7 +94,7 @@ def test_histogram_protected(context, registry):
         registry=registry,
     )
 
-    histogram.prometheus = 'THIS WILL RAISE'
+    monkeypatch.setattr(histogram, '_prometheus_metric', 'THIS WILL RAISE')
     histogram.observe(1.0, label='label')
     context.assert_log(msg='Failed to collect histogram metric')
 
@@ -115,7 +117,7 @@ def test_counter(context, registry):
     assert metric - count == 2
 
 
-def test_counter_protected(context, registry):
+def test_counter_protected(monkeypatch, context, registry):
     counter = metrics.Counter(
         name='test_counter_protected',
         documentation='test counter',
@@ -123,6 +125,38 @@ def test_counter_protected(context, registry):
         statsd='{name}.{label}',
     )
 
-    counter.prometheus = 'THIS WILL RAISE'
+    monkeypatch.setattr(counter, '_prometheus_metric', 'THIS WILL RAISE')
     counter.inc(1, label='label')
     context.assert_log(msg='Failed to increment counter metric')
+
+
+def test_only_workers_prometheus(monkeypatch, registry):
+    counter = metrics.Counter(
+        _only_workers_prometheus=True,
+        name='test_counter_onlyworker',
+        documentation='test counter',
+        labelnames=['label'],
+        statsd='{name}.{label}',
+        registry=registry,
+    )
+
+    labels = {'label': 'value'}
+    count = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    counter.inc(3, **labels)
+
+    metric = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    assert metric - count == 0
+
+    fake_worker_pid = os.getpid() + 1
+    monkeypatch.setattr(os, 'getpid', lambda: fake_worker_pid)
+
+    counter.inc(4, **labels)
+
+    metric = registry.get_metric(
+        counter_name('test_counter_onlyworker_total'), **labels
+    )
+    assert metric - count == 4
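
For reference, a minimal standalone sketch (not part of the patch) of the
deadlock mechanism described in the commit message. All names are
illustrative: SIGALRM stands in for SIGCHLD, a plain threading.Lock stands
in for the prometheus_client internal lock, and a timeout is used so the
script recovers instead of hanging. Assumes a Unix-like system:

    # deadlock_sketch.py - a signal handler re-entering a critical
    # section on the same thread, as in the arbiter deadlock above.
    import signal
    import threading
    import time

    # Stand-in for the non-reentrant lock prometheus_client holds while
    # updating a metric value in multiprocess mode.
    values_lock = threading.Lock()

    def increment_metric(who):
        # Critical section: the lock is held while the value is "updated".
        if not values_lock.acquire(timeout=5):
            # With a real blocking acquire() this would hang forever.
            print('%s: lock already held by this same thread -> deadlock' % who)
            return
        try:
            time.sleep(2)  # window during which a signal can arrive
        finally:
            values_lock.release()

    def on_signal(signum, frame):
        # The arbiter's handler logs an error; sentry instrumentation
        # turns that into another metric increment on the same lock.
        increment_metric('signal handler')

    signal.signal(signal.SIGALRM, on_signal)
    signal.alarm(1)  # deliver the signal while the main flow holds the lock
    increment_metric('main flow')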
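
And a small sketch of how the new _only_workers_prometheus flag behaves
across a fork, mirroring the gunicorn arbiter/worker split. The metric
name is made up, and prometheus_client is assumed to be installed so the
underlying metric object exists:

    # flag_sketch.py - the prometheus side of a flagged metric is
    # suppressed in the creating process but active in forked children.
    import os
    import talisker.metrics

    counter = talisker.metrics.Counter(
        _only_workers_prometheus=True,
        name='demo_requests_count',
        documentation='demo counter',
        labelnames=['host'],
    )

    # In the creating process (the arbiter, under gunicorn) the property
    # returns None, so inc() never touches prometheus_client locks here.
    assert counter.prometheus is None

    if os.fork() == 0:
        # In the child the pid no longer matches _origin_pid, so the
        # underlying prometheus metric is exposed and updated as usual.
        assert counter.prometheus is not None
        counter.inc(1, host='example.com')
        os._exit(0)
    else:
        os.wait()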