From 0f6c43529c5b8f02f8dc87f4d31ad5701180ae17 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 10:15:16 +0100 Subject: [PATCH 1/9] feat: Re-attempt to add custom metrics --- api/prometheus_metrics.py | 45 +++++++++++++++++++++++++++++++++++ api/remote_ispyb_connector.py | 4 ++++ api/security.py | 5 ++++ fragalysis/settings.py | 12 +++++++++- 4 files changed, 65 insertions(+), 1 deletion(-) create mode 100644 api/prometheus_metrics.py diff --git a/api/prometheus_metrics.py b/api/prometheus_metrics.py new file mode 100644 index 00000000..d05a1d58 --- /dev/null +++ b/api/prometheus_metrics.py @@ -0,0 +1,45 @@ +"""Prometheus metrics used by the fragalysis API module. +""" +from django.conf import settings +from prometheus_client import Counter + +_NAMESPACE: str = settings.OUR_KUBERNETES_NAMESPACE or 'unknown' + + +class PrometheusMetrics: + ssh_tunnels = Counter( + 'fragalysis_ssh_tunnels', + 'Total number of SSH tunnels created', + labelnames=['k8s_namespace'], + ) + ssh_tunnel_failures = Counter( + 'fragalysis_ssh_tunnel_failures', + 'Total number of SSH tunnel failures', + labelnames=['k8s_namespace'], + ) + ispyb_connections = Counter( + 'fragalysis_ispyb_connections', + 'Total number of ISpyB connections', + labelnames=['k8s_namespace'], + ) + ispyb_connection_failures = Counter( + 'fragalysis_ispyb_connection_failures', + 'Total number of ISpyB connection failures', + labelnames=['k8s_namespace'], + ) + + @staticmethod + def new_tunnel(): + PrometheusMetrics.ssh_tunnels.labels(_NAMESPACE).inc() + + @staticmethod + def failed_tunnel(): + PrometheusMetrics.ssh_tunnel_failures.labels(_NAMESPACE).inc() + + @staticmethod + def new_ispyb_connection(): + PrometheusMetrics.ispyb_connections.labels(_NAMESPACE).inc() + + @staticmethod + def failed_ispyb_connection(): + PrometheusMetrics.ispyb_connection_failures.labels(_NAMESPACE).inc() diff --git a/api/remote_ispyb_connector.py b/api/remote_ispyb_connector.py index c27bb5a8..f28f2e20 100644 --- 
a/api/remote_ispyb_connector.py +++ b/api/remote_ispyb_connector.py @@ -13,6 +13,8 @@ ) from pymysql.err import OperationalError +from .prometheus_metrics import PrometheusMetrics + logger: logging.Logger = logging.getLogger(__name__) # Timeout to allow the pymysql.connect() method to connect to the DB. @@ -181,10 +183,12 @@ def remote_connect( if self.conn is not None: if connect_attempts > 0: logger.info('Connected') + PrometheusMetrics.new_ispyb_connection() self.conn.autocommit = True else: if connect_attempts > 0: logger.info('Failed to connect') + PrometheusMetrics.failed_ispyb_connection() self.server.stop() raise ISPyBConnectionException self.last_activity_ts = time.time() diff --git a/api/security.py b/api/security.py index 8ecf6976..40677ee9 100644 --- a/api/security.py +++ b/api/security.py @@ -17,6 +17,7 @@ from viewer.models import Project +from .prometheus_metrics import PrometheusMetrics from .remote_ispyb_connector import SSHConnector logger: logging.Logger = logging.getLogger(__name__) @@ -105,8 +106,10 @@ def get_remote_conn(force_error_display=False) -> Optional[SSHConnector]: logger.exception("Got the following exception creating Connector...") if conn: logger.debug("Got remote connector") + PrometheusMetrics.new_tunnel() else: logger.debug("Failed to get a remote connector") + PrometheusMetrics.failed_tunnel() return conn @@ -140,8 +143,10 @@ def get_conn(force_error_display=False) -> Optional[Connector]: logger.exception("Got the following exception creating Connector...") if conn: logger.debug("Got connector") + PrometheusMetrics.new_ispyb_connection() else: logger.debug("Did not get a connector") + PrometheusMetrics.failed_ispyb_connection() return conn diff --git a/fragalysis/settings.py b/fragalysis/settings.py index ef3775f1..427a8827 100644 --- a/fragalysis/settings.py +++ b/fragalysis/settings.py @@ -67,7 +67,7 @@ import os import sys from datetime import timedelta -from typing import List +from typing import List, Optional import 
sentry_sdk from sentry_sdk.integrations.celery import CeleryIntegration @@ -535,6 +535,16 @@ NEO4J_QUERY: str = os.environ.get("NEO4J_QUERY", "neo4j") NEO4J_AUTH: str = os.environ.get("NEO4J_AUTH", "neo4j/neo4j") +# Does it look like we're running in Kubernetes? +# If so, let's get the namespace we're in - it will provide +# useful discrimination material in log/metrics messages. +# If there is no apparent namespace the variable will be 'None'. +OUR_KUBERNETES_NAMESPACE: Optional[str] = None +_NS_FILENAME: str = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' +if os.path.isfile(_NS_FILENAME): + with open(_NS_FILENAME, 'rt', encoding='utf8') as ns_file: + OUR_KUBERNETES_NAMESPACE = ns_file.read().strip() + # These flags are used in the upload_tset form as follows. # Proposal Supported | Proposal Required | Proposal / View fields # Y | Y | Shown / Required From a99259890ac15e7064984a520d44a97bc7123fe3 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 10:38:05 +0100 Subject: [PATCH 2/9] feat: New metric (ISpyB connection attempts) --- api/prometheus_metrics.py | 9 +++++++++ api/remote_ispyb_connector.py | 1 + 2 files changed, 10 insertions(+) diff --git a/api/prometheus_metrics.py b/api/prometheus_metrics.py index d05a1d58..0dd2deff 100644 --- a/api/prometheus_metrics.py +++ b/api/prometheus_metrics.py @@ -22,6 +22,11 @@ class PrometheusMetrics: 'Total number of ISpyB connections', labelnames=['k8s_namespace'], ) + ispyb_connection_attempts = Counter( + 'fragalysis_ispyb_connection_attempts', + 'Total number of ISpyB connection attempts (after initial failure)', + labelnames=['k8s_namespace'], + ) ispyb_connection_failures = Counter( 'fragalysis_ispyb_connection_failures', 'Total number of ISpyB connection failures', @@ -40,6 +45,10 @@ def failed_tunnel(): def new_ispyb_connection(): PrometheusMetrics.ispyb_connections.labels(_NAMESPACE).inc() + @staticmethod + def new_ispyb_connection_attempt(): + 
PrometheusMetrics.ispyb_connection_attempts.labels(_NAMESPACE).inc() + @staticmethod def failed_ispyb_connection(): PrometheusMetrics.ispyb_connection_failures.labels(_NAMESPACE).inc() diff --git a/api/remote_ispyb_connector.py b/api/remote_ispyb_connector.py index f28f2e20..329a840e 100644 --- a/api/remote_ispyb_connector.py +++ b/api/remote_ispyb_connector.py @@ -178,6 +178,7 @@ def remote_connect( ) logger.warning('Unexpected %s', repr(e)) connect_attempts += 1 + PrometheusMetrics.new_ispyb_connection_attempt() time.sleep(PYMYSQL_EXCEPTION_RECONNECT_DELAY_S) if self.conn is not None: From 3a7fad865aee5fc078ba32b112f8be21968d21ef Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 10:44:49 +0100 Subject: [PATCH 3/9] feat: Add cache hit/miss metrics --- api/prometheus_metrics.py | 18 ++++++++++++++++++ api/security.py | 3 +++ 2 files changed, 21 insertions(+) diff --git a/api/prometheus_metrics.py b/api/prometheus_metrics.py index 0dd2deff..35391a07 100644 --- a/api/prometheus_metrics.py +++ b/api/prometheus_metrics.py @@ -32,6 +32,16 @@ class PrometheusMetrics: 'Total number of ISpyB connection failures', labelnames=['k8s_namespace'], ) + proposal_cache_hit = Counter( + 'fragalysis_proposal_cache_hit', + 'Total number of proposal cache hits (avoiding new connections)', + labelnames=['k8s_namespace'], + ) + proposal_cache_miss = Counter( + 'fragalysis_proposal_cache_miss', + 'Total number of proposal cache misses (forcing a new connection)', + labelnames=['k8s_namespace'], + ) @staticmethod def new_tunnel(): @@ -52,3 +62,11 @@ def new_ispyb_connection_attempt(): @staticmethod def failed_ispyb_connection(): PrometheusMetrics.ispyb_connection_failures.labels(_NAMESPACE).inc() + + @staticmethod + def new_proposal_cache_hit(): + PrometheusMetrics.proposal_cache_hit.labels(_NAMESPACE).inc() + + @staticmethod + def new_proposal_cache_miss(): + PrometheusMetrics.proposal_cache_miss.labels(_NAMESPACE).inc() diff --git a/api/security.py b/api/security.py 
index 40677ee9..6adaaff1 100644 --- a/api/security.py +++ b/api/security.py @@ -237,12 +237,15 @@ def _run_query_with_connector(self, conn, user): def _get_proposals_for_user_from_ispyb(self, user): if CachedContent.has_expired(user.username): logger.info("Cache has expired for '%s'", user.username) + PrometheusMetrics.new_proposal_cache_miss() if conn := get_configured_connector(): logger.debug("Got a connector for '%s'", user.username) self._get_proposals_from_connector(user, conn) else: logger.warning("Failed to get a connector for '%s'", user.username) self._mark_cache_collection_failure(user) + else: + PrometheusMetrics.new_proposal_cache_hit() # The cache has either been updated, has not changed or is empty. # Return what we have for the user. Public (open) proposals From 4393b6c57e9b7a1b4a6b759cdbfe13a67767e538 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 11:27:20 +0100 Subject: [PATCH 4/9] feat: Leaner metrics (and initial dashboard) --- api/prometheus_metrics.py | 24 +-- grafana/dashboard.json | 299 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 306 insertions(+), 17 deletions(-) create mode 100644 grafana/dashboard.json diff --git a/api/prometheus_metrics.py b/api/prometheus_metrics.py index 35391a07..e8fb39d1 100644 --- a/api/prometheus_metrics.py +++ b/api/prometheus_metrics.py @@ -1,72 +1,62 @@ """Prometheus metrics used by the fragalysis API module. 
""" -from django.conf import settings from prometheus_client import Counter -_NAMESPACE: str = settings.OUR_KUBERNETES_NAMESPACE or 'unknown' - class PrometheusMetrics: ssh_tunnels = Counter( 'fragalysis_ssh_tunnels', 'Total number of SSH tunnels created', - labelnames=['k8s_namespace'], ) ssh_tunnel_failures = Counter( 'fragalysis_ssh_tunnel_failures', 'Total number of SSH tunnel failures', - labelnames=['k8s_namespace'], ) ispyb_connections = Counter( 'fragalysis_ispyb_connections', 'Total number of ISpyB connections', - labelnames=['k8s_namespace'], ) ispyb_connection_attempts = Counter( 'fragalysis_ispyb_connection_attempts', 'Total number of ISpyB connection attempts (after initial failure)', - labelnames=['k8s_namespace'], ) ispyb_connection_failures = Counter( 'fragalysis_ispyb_connection_failures', 'Total number of ISpyB connection failures', - labelnames=['k8s_namespace'], ) proposal_cache_hit = Counter( 'fragalysis_proposal_cache_hit', 'Total number of proposal cache hits (avoiding new connections)', - labelnames=['k8s_namespace'], ) proposal_cache_miss = Counter( 'fragalysis_proposal_cache_miss', 'Total number of proposal cache misses (forcing a new connection)', - labelnames=['k8s_namespace'], ) @staticmethod def new_tunnel(): - PrometheusMetrics.ssh_tunnels.labels(_NAMESPACE).inc() + PrometheusMetrics.ssh_tunnels.inc() @staticmethod def failed_tunnel(): - PrometheusMetrics.ssh_tunnel_failures.labels(_NAMESPACE).inc() + PrometheusMetrics.ssh_tunnel_failures.inc() @staticmethod def new_ispyb_connection(): - PrometheusMetrics.ispyb_connections.labels(_NAMESPACE).inc() + PrometheusMetrics.ispyb_connections.inc() @staticmethod def new_ispyb_connection_attempt(): - PrometheusMetrics.ispyb_connection_attempts.labels(_NAMESPACE).inc() + PrometheusMetrics.ispyb_connection_attempts.inc() @staticmethod def failed_ispyb_connection(): - PrometheusMetrics.ispyb_connection_failures.labels(_NAMESPACE).inc() + PrometheusMetrics.ispyb_connection_failures.inc() 
@staticmethod def new_proposal_cache_hit(): - PrometheusMetrics.proposal_cache_hit.labels(_NAMESPACE).inc() + PrometheusMetrics.proposal_cache_hit.inc() @staticmethod def new_proposal_cache_miss(): - PrometheusMetrics.proposal_cache_miss.labels(_NAMESPACE).inc() + PrometheusMetrics.proposal_cache_miss.inc() diff --git a/grafana/dashboard.json b/grafana/dashboard.json new file mode 100644 index 00000000..1e302195 --- /dev/null +++ b/grafana/dashboard.json @@ -0,0 +1,299 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.1.5" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "The Fragalysis Stack Grafana Dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "{container=\"stack\", endpoint=\"http\", instance=\"10.42.1.105:80\", job=\"stack\", k8s_namespace=\"stack-alan-default\", namespace=\"stack-alan-default\", pod=\"stack-0\", service=\"stack\"}" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ssh_tunnels_total[$__rate_interval])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "SSH Tunnel Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 10, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(fragalysis_ispyb_connections_total[$__rate_interval])", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "hide": false, + "refId": "B" + } + ], + "title": "ISpyB Connections", + "type": "timeseries" + } + ], + "schemaVersion": 37, + "style": "dark", + "tags": [ + "fragalysis" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Fragalysis", + "uid": "Ue0SYxPIk", + "version": 3, + "weekStart": "" +} From 2c1ef1fc4328a7813e97d05a141deae69ae56f52 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 11:46:42 +0100 Subject: [PATCH 5/9] feat: Metrics now initialised --- api/prometheus_metrics.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/api/prometheus_metrics.py b/api/prometheus_metrics.py index e8fb39d1..fc268a38 100644 --- a/api/prometheus_metrics.py +++ b/api/prometheus_metrics.py @@ -4,34 +4,46 @@ class PrometheusMetrics: + """A static class to hold the Prometheus metrics for the fragalysis API module. 
+ Each metric has its own static method to adjust it. + """ + + # Create, and initialise the metrics for this module ssh_tunnels = Counter( 'fragalysis_ssh_tunnels', 'Total number of SSH tunnels created', ) + ssh_tunnels.reset() ssh_tunnel_failures = Counter( 'fragalysis_ssh_tunnel_failures', 'Total number of SSH tunnel failures', ) + ssh_tunnel_failures.reset() ispyb_connections = Counter( 'fragalysis_ispyb_connections', 'Total number of ISpyB connections', ) + ispyb_connections.reset() ispyb_connection_attempts = Counter( 'fragalysis_ispyb_connection_attempts', 'Total number of ISpyB connection attempts (after initial failure)', ) + ispyb_connection_attempts.reset() ispyb_connection_failures = Counter( 'fragalysis_ispyb_connection_failures', 'Total number of ISpyB connection failures', ) + ispyb_connection_failures.reset() proposal_cache_hit = Counter( 'fragalysis_proposal_cache_hit', 'Total number of proposal cache hits (avoiding new connections)', ) + proposal_cache_hit.reset() proposal_cache_miss = Counter( 'fragalysis_proposal_cache_miss', 'Total number of proposal cache misses (forcing a new connection)', ) + proposal_cache_miss.reset() @staticmethod def new_tunnel(): From 3f61fb2d69a8e8081bb3c4db15835c9bb1fe546b Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 12:32:32 +0100 Subject: [PATCH 6/9] docs: Updated dashboard --- grafana/dashboard.json | 382 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 350 insertions(+), 32 deletions(-) diff --git a/grafana/dashboard.json b/grafana/dashboard.json index 1e302195..28c34c66 100644 --- a/grafana/dashboard.json +++ b/grafana/dashboard.json @@ -60,6 +60,17 @@ "links": [], "liveNow": false, "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "title": "Security (API)", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -85,7 +96,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, "pointSize": 5, 
"scaleDistribution": { "type": "linear" @@ -117,25 +128,31 @@ }, "overrides": [ { - "__systemRef": "hideSeriesFrom", "matcher": { - "id": "byNames", - "options": { - "mode": "exclude", - "names": [ - "{container=\"stack\", endpoint=\"http\", instance=\"10.42.1.105:80\", job=\"stack\", k8s_namespace=\"stack-alan-default\", namespace=\"stack-alan-default\", pod=\"stack-0\", service=\"stack\"}" - ], - "prefix": "All except:", - "readOnly": true + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" }, "properties": [ { - "id": "custom.hideFrom", + "id": "color", "value": { - "legend": false, - "tooltip": false, - "viz": true + "fixedColor": "blue", + "mode": "fixed" } } ] @@ -143,12 +160,12 @@ ] }, "gridPos": { - "h": 8, - "w": 10, + "h": 7, + "w": 6, "x": 0, - "y": 0 + "y": 1 }, - "id": 4, + "id": 12, "options": { "legend": { "calcs": [], @@ -168,13 +185,25 @@ "uid": "${DS_PROMETHEUS}" }, "editorMode": "builder", - "expr": "rate(fragalysis_ssh_tunnels_total[$__rate_interval])", - "legendFormat": "__auto", + "expr": "rate(fragalysis_proposal_cache_miss_total{namespace=\"$Namespace\"}[$__rate_interval])", + "hide": false, + "legendFormat": "Miss", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_proposal_cache_hit_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Hit", "range": true, "refId": "A" } ], - "title": "SSH Tunnel Connections", + "title": "Cache Hit and Miss", "type": "timeseries" }, { @@ -185,7 +214,8 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "fixedColor": "green", + "mode": "fixed" }, "custom": { "axisCenteredZero": false, @@ -202,7 +232,7 @@ "viz": false }, "lineInterpolation": "linear", - "lineWidth": 1, + "lineWidth": 2, 
"pointSize": 5, "scaleDistribution": { "type": "linear" @@ -232,13 +262,180 @@ ] } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { - "h": 8, - "w": 10, - "x": 10, - "y": 0 + "h": 7, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ssh_tunnels_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Connections", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ssh_tunnel_failures_total{namespace=\"$Namespace\"}[$__rate_interval])", + "hide": false, + "legendFormat": "Failures", + "range": true, + "refId": "B" + } + ], + "title": "SSH Tunnels", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 8, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", 
+ "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 1 }, "id": 2, "options": { @@ -261,7 +458,7 @@ }, "editorMode": "code", "expr": "rate(fragalysis_ispyb_connections_total[$__rate_interval])", - "legendFormat": "__auto", + "legendFormat": "Connections", "range": true, "refId": "A" }, @@ -270,12 +467,109 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "editorMode": "builder", + "expr": "rate(fragalysis_ispyb_connection_failures_total{namespace=\"$Namespace\"}[$__rate_interval])", "hide": false, + "legendFormat": "Failures", + "range": true, "refId": "B" } ], "title": "ISpyB Connections", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "orange", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ispyb_connection_attempts_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Retries", + "range": true, + "refId": "A" + } + ], + "title": "ISPyB Connection Retries", + "type": "timeseries" } ], "schemaVersion": 37, @@ -284,16 +578,40 @@ "fragalysis" ], "templating": { - "list": [] + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(fragalysis_ispyb_connections, namespace)", + "description": "The kubernetes Namespace of the Fragalysis Stack of interest", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "Namespace", + "options": [], + "query": { + "query": "label_values(fragalysis_ispyb_connections, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] }, "time": { - "from": "now-6h", + "from": "now-1h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Fragalysis", "uid": "Ue0SYxPIk", - "version": 3, + "version": 8, "weekStart": "" } From e8f3b252030e11c877e5b6be37039722849146e9 Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 12:36:36 +0100 Subject: [PATCH 7/9] fix: Attempt to fix hit/miss metrics --- api/security.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/api/security.py b/api/security.py 
index 6adaaff1..988e4956 100644 --- a/api/security.py +++ b/api/security.py @@ -48,10 +48,12 @@ def has_expired(username) -> bool: # User's not known, # initialise an entry that will automatically expire CachedContent._timers[username] = now + PrometheusMetrics.new_proposal_cache_hit() if CachedContent._timers[username] <= now: has_expired = True # Expired, reset the expiry time CachedContent._timers[username] = now + CachedContent._cache_period + PrometheusMetrics.new_proposal_cache_miss() return has_expired @staticmethod @@ -237,15 +239,12 @@ def _run_query_with_connector(self, conn, user): def _get_proposals_for_user_from_ispyb(self, user): if CachedContent.has_expired(user.username): logger.info("Cache has expired for '%s'", user.username) - PrometheusMetrics.new_proposal_cache_miss() if conn := get_configured_connector(): logger.debug("Got a connector for '%s'", user.username) self._get_proposals_from_connector(user, conn) else: logger.warning("Failed to get a connector for '%s'", user.username) self._mark_cache_collection_failure(user) - else: - PrometheusMetrics.new_proposal_cache_hit() # The cache has either been updated, has not changed or is empty. # Return what we have for the user. 
Public (open) proposals From 6fb68ec79406570e71682c30b643d4c1037c526c Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 12:38:34 +0100 Subject: [PATCH 8/9] docs: New dashboard --- grafana/dashboard.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/grafana/dashboard.json b/grafana/dashboard.json index 28c34c66..2a58acd8 100644 --- a/grafana/dashboard.json +++ b/grafana/dashboard.json @@ -585,7 +585,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, - "definition": "label_values(fragalysis_ispyb_connections, namespace)", + "definition": "label_values(fragalysis_ispyb_connections_total, namespace)", "description": "The kubernetes Namespace of the Fragalysis Stack of interest", "hide": 0, "includeAll": false, @@ -593,7 +593,7 @@ "name": "Namespace", "options": [], "query": { - "query": "label_values(fragalysis_ispyb_connections, namespace)", + "query": "label_values(fragalysis_ispyb_connections_total, namespace)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -612,6 +612,6 @@ "timezone": "", "title": "Fragalysis", "uid": "Ue0SYxPIk", - "version": 8, + "version": 9, "weekStart": "" } From 1d0d894a52bc7195bb6dd9b83cf17bc30bdfaaab Mon Sep 17 00:00:00 2001 From: Alan Christie Date: Wed, 22 May 2024 12:57:25 +0100 Subject: [PATCH 9/9] docs: Dashboard tweak --- grafana/dashboard.json | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/grafana/dashboard.json b/grafana/dashboard.json index 2a58acd8..07e0326e 100644 --- a/grafana/dashboard.json +++ b/grafana/dashboard.json @@ -76,6 +76,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "The rate of hits and misses of the proposal cache", "fieldConfig": { "defaults": { "color": { @@ -203,7 +204,7 @@ "refId": "A" } ], - "title": "Cache Hit and Miss", + "title": "Proposal Cache", "type": "timeseries" }, { @@ -211,6 +212,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "The rate of SSH tunnel 
connections and errors", "fieldConfig": { "defaults": { "color": { @@ -322,7 +324,7 @@ }, "editorMode": "builder", "expr": "rate(fragalysis_ssh_tunnels_total{namespace=\"$Namespace\"}[$__rate_interval])", - "legendFormat": "Connections", + "legendFormat": "Success", "range": true, "refId": "A" }, @@ -347,6 +349,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "The rate of ISPyB (MySQL) connections and failures", "fieldConfig": { "defaults": { "color": { @@ -458,7 +461,7 @@ }, "editorMode": "code", "expr": "rate(fragalysis_ispyb_connections_total[$__rate_interval])", - "legendFormat": "Connections", + "legendFormat": "Success", "range": true, "refId": "A" }, @@ -475,7 +478,7 @@ "refId": "B" } ], - "title": "ISpyB Connections", + "title": "ISPyB Connection", "type": "timeseries" }, { @@ -483,6 +486,7 @@ "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, + "description": "The rate of ISPyB (MySQL) connection retries. Connections are retried a number of times before failing.", "fieldConfig": { "defaults": { "color": { @@ -612,6 +616,6 @@ "timezone": "", "title": "Fragalysis", "uid": "Ue0SYxPIk", - "version": 9, + "version": 11, "weekStart": "" }