diff --git a/api/prometheus_metrics.py b/api/prometheus_metrics.py
new file mode 100644
index 00000000..fc268a38
--- /dev/null
+++ b/api/prometheus_metrics.py
@@ -0,0 +1,83 @@
+"""Prometheus metrics used by the fragalysis API module.
+"""
+from prometheus_client import Counter
+
+
+class PrometheusMetrics:
+    """A static class to hold the Prometheus metrics for the fragalysis API module.
+    Each metric has its own static method to adjust it.
+    """
+
+    # Create, and initialise the metrics for this module.
+    # The Grafana dashboard queries these counters with a '_total' suffix
+    # (e.g. 'fragalysis_ssh_tunnels_total').
+    ssh_tunnels = Counter(
+        'fragalysis_ssh_tunnels',
+        'Total number of SSH tunnels created',
+    )
+    ssh_tunnels.reset()
+    ssh_tunnel_failures = Counter(
+        'fragalysis_ssh_tunnel_failures',
+        'Total number of SSH tunnel failures',
+    )
+    ssh_tunnel_failures.reset()
+    ispyb_connections = Counter(
+        'fragalysis_ispyb_connections',
+        'Total number of ISPyB connections',
+    )
+    ispyb_connections.reset()
+    ispyb_connection_attempts = Counter(
+        'fragalysis_ispyb_connection_attempts',
+        'Total number of ISPyB connection attempts (after initial failure)',
+    )
+    ispyb_connection_attempts.reset()
+    ispyb_connection_failures = Counter(
+        'fragalysis_ispyb_connection_failures',
+        'Total number of ISPyB connection failures',
+    )
+    ispyb_connection_failures.reset()
+    proposal_cache_hit = Counter(
+        'fragalysis_proposal_cache_hit',
+        'Total number of proposal cache hits (avoiding new connections)',
+    )
+    proposal_cache_hit.reset()
+    proposal_cache_miss = Counter(
+        'fragalysis_proposal_cache_miss',
+        'Total number of proposal cache misses (forcing a new connection)',
+    )
+    proposal_cache_miss.reset()
+
+    @staticmethod
+    def new_tunnel():
+        """Count one newly created SSH tunnel."""
+        PrometheusMetrics.ssh_tunnels.inc()
+
+    @staticmethod
+    def failed_tunnel():
+        """Count one SSH tunnel failure."""
+        PrometheusMetrics.ssh_tunnel_failures.inc()
+
+    @staticmethod
+    def new_ispyb_connection():
+        """Count one ISPyB connection."""
+        PrometheusMetrics.ispyb_connections.inc()
+
+    @staticmethod
+    def new_ispyb_connection_attempt():
+        """Count one further ISPyB connection attempt made after an initial failure."""
+        PrometheusMetrics.ispyb_connection_attempts.inc()
+
+    @staticmethod
+    def failed_ispyb_connection():
+        """Count one ISPyB connection failure."""
+        PrometheusMetrics.ispyb_connection_failures.inc()
+
+    @staticmethod
+    def new_proposal_cache_hit():
+        """Count one proposal cache hit."""
+        PrometheusMetrics.proposal_cache_hit.inc()
+
+    @staticmethod
+    def new_proposal_cache_miss():
+        """Count one proposal cache miss."""
+        PrometheusMetrics.proposal_cache_miss.inc()
diff --git a/api/remote_ispyb_connector.py b/api/remote_ispyb_connector.py index c27bb5a8..329a840e 100644 --- a/api/remote_ispyb_connector.py +++ b/api/remote_ispyb_connector.py @@ -13,6 +13,8 @@ ) from pymysql.err import OperationalError +from .prometheus_metrics import PrometheusMetrics + logger: logging.Logger = logging.getLogger(__name__) # Timeout to allow the pymysql.connect() method to connect to the DB. @@ -176,15 +178,18 @@ def remote_connect( ) logger.warning('Unexpected %s', repr(e)) connect_attempts += 1 + PrometheusMetrics.new_ispyb_connection_attempt() time.sleep(PYMYSQL_EXCEPTION_RECONNECT_DELAY_S) if self.conn is not None: if connect_attempts > 0: logger.info('Connected') + PrometheusMetrics.new_ispyb_connection() self.conn.autocommit = True else: if connect_attempts > 0: logger.info('Failed to connect') + PrometheusMetrics.failed_ispyb_connection() self.server.stop() raise ISPyBConnectionException self.last_activity_ts = time.time() diff --git a/api/security.py b/api/security.py index 8ecf6976..988e4956 100644 --- a/api/security.py +++ b/api/security.py @@ -17,6 +17,7 @@ from viewer.models import Project +from .prometheus_metrics import PrometheusMetrics from .remote_ispyb_connector import SSHConnector logger: logging.Logger = logging.getLogger(__name__) @@ -47,10 +48,12 @@ def has_expired(username) -> bool: # User's not known, # initialise an entry that will automatically expire CachedContent._timers[username] = now + PrometheusMetrics.new_proposal_cache_hit() if CachedContent._timers[username] <= now: has_expired = True # Expired, reset the expiry time CachedContent._timers[username] = now + CachedContent._cache_period + PrometheusMetrics.new_proposal_cache_miss() return 
has_expired @staticmethod @@ -105,8 +108,10 @@ def get_remote_conn(force_error_display=False) -> Optional[SSHConnector]: logger.exception("Got the following exception creating Connector...") if conn: logger.debug("Got remote connector") + PrometheusMetrics.new_tunnel() else: logger.debug("Failed to get a remote connector") + PrometheusMetrics.failed_tunnel() return conn @@ -140,8 +145,10 @@ def get_conn(force_error_display=False) -> Optional[Connector]: logger.exception("Got the following exception creating Connector...") if conn: logger.debug("Got connector") + PrometheusMetrics.new_ispyb_connection() else: logger.debug("Did not get a connector") + PrometheusMetrics.failed_ispyb_connection() return conn diff --git a/fragalysis/settings.py b/fragalysis/settings.py index ef3775f1..427a8827 100644 --- a/fragalysis/settings.py +++ b/fragalysis/settings.py @@ -67,7 +67,7 @@ import os import sys from datetime import timedelta -from typing import List +from typing import List, Optional import sentry_sdk from sentry_sdk.integrations.celery import CeleryIntegration @@ -535,6 +535,16 @@ NEO4J_QUERY: str = os.environ.get("NEO4J_QUERY", "neo4j") NEO4J_AUTH: str = os.environ.get("NEO4J_AUTH", "neo4j/neo4j") +# Does it look like we're running in Kubernetes? +# If so, let's get the namespace we're in - it will provide +# useful discrimination material in log/metrics messages. +# If there is no apparent namespace the variable will be 'None'. +OUR_KUBERNETES_NAMESPACE: Optional[str] = None +_NS_FILENAME: str = '/var/run/secrets/kubernetes.io/serviceaccount/namespace' +if os.path.isfile(_NS_FILENAME): + with open(_NS_FILENAME, 'rt', encoding='utf8') as ns_file: + OUR_KUBERNETES_NAMESPACE = ns_file.read().strip() + # These flags are used in the upload_tset form as follows. 
# Proposal Supported | Proposal Required | Proposal / View fields # Y | Y | Shown / Required diff --git a/grafana/dashboard.json b/grafana/dashboard.json new file mode 100644 index 00000000..07e0326e --- /dev/null +++ b/grafana/dashboard.json @@ -0,0 +1,621 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.1.5" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "The Fragalysis Stack Grafana Dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "title": "Security (API)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The rate of hits and misses of the proposal cache", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + 
"lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 12, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_proposal_cache_miss_total{namespace=\"$Namespace\"}[$__rate_interval])", + "hide": false, + "legendFormat": "Miss", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_proposal_cache_hit_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Hit", + "range": true, + "refId": "A" + } + ], + "title": "Proposal Cache", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The rate of SSH tunnel connections and errors", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ssh_tunnels_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ssh_tunnel_failures_total{namespace=\"$Namespace\"}[$__rate_interval])", + "hide": false, + "legendFormat": "Failures", + "range": true, + "refId": "B" + } + ], + "title": "SSH Tunnels", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The rate of ISPyB 
(MySQL) connections and failures", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "green", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 8, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byFrameRefID", + "options": "A" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byFrameRefID", + "options": "B" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(fragalysis_ispyb_connections_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Success", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ispyb_connection_failures_total{namespace=\"$Namespace\"}[$__rate_interval])", + "hide": false, + "legendFormat": "Failures", + 
"range": true, + "refId": "B" + } + ], + "title": "ISPyB Connection", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "The rate of ISPyB (MySQL) connection retries. Connections are retried a number of times before failing.", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "orange", + "mode": "fixed" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "builder", + "expr": "rate(fragalysis_ispyb_connection_attempts_total{namespace=\"$Namespace\"}[$__rate_interval])", + "legendFormat": "Retries", + "range": true, + "refId": "A" + } + ], + "title": "ISPyB Connection Retries", + "type": "timeseries" + } + ], + "schemaVersion": 37, + "style": "dark", + "tags": [ + "fragalysis" + ], + "templating": { + "list": [ + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": 
"label_values(fragalysis_ispyb_connections_total, namespace)", + "description": "The kubernetes Namespace of the Fragalysis Stack of interest", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "Namespace", + "options": [], + "query": { + "query": "label_values(fragalysis_ispyb_connections_total, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Fragalysis", + "uid": "Ue0SYxPIk", + "version": 11, + "weekStart": "" +}