Alerts integration improvements and oom killer #9

Merged 6 commits on Jun 29, 2021
2 changes: 2 additions & 0 deletions docs/user-guide/alerts.rst
@@ -146,6 +146,8 @@ Available enrichers

**NodeCPUAnalysis:** provide deep analysis of node cpu usage

**OOMKillerEnricher:** shows which pods were recently OOM Killed on a node

**GraphEnricher:** display a graph of the Prometheus query which triggered the alert

**StackOverflowEnricher:** add a button in Slack to search for the alert name on StackOverflow
40 changes: 30 additions & 10 deletions playbooks/alerts_integration.py
@@ -9,11 +9,14 @@

from robusta.api import *
from node_cpu_analysis import do_node_cpu_analysis
from oom_killer import do_show_recent_oom_kills


class GenParams(BaseModel):
name: str
params: Dict[Any,Any] = None


class Silencer:
params: Dict[Any,Any]

@@ -34,18 +37,20 @@ def __init__(self, params: Dict[Any, Any]):
self.post_restart_silence = self.params.get("post_restart_silence")

def silence(self, alert: PrometheusKubernetesAlert) -> bool:
if not alert.obj or not alert.obj.kind == "Pod":
if not alert.pod:
return False # Silencing only pod alerts on NodeRestartSilencer

node: Node = Node.readNode(alert.obj.spec.nodeName).obj
# TODO: do we already have alert.Node here?
node: Node = Node.readNode(alert.pod.spec.nodeName).obj
if not node:
logging.warning(f"Node {alert.obj.spec.nodeName} not found for NodeRestartSilencer for {alert}")
logging.warning(f"Node {alert.pod.spec.nodeName} not found for NodeRestartSilencer for {alert}")
return False

last_transition_times = [condition.lastTransitionTime for condition in node.status.conditions if condition.type == "Ready"]
last_transition_times = [condition.lastTransitionTime for condition in node.status.conditions
if condition.type == "Ready"]
if last_transition_times and last_transition_times[0]:
node_start_time_str = last_transition_times[0]
else: # if no ready time, take creation time
else: # if no ready time, take creation time
node_start_time_str = node.metadata.creationTimestamp

node_start_time = datetime.strptime(node_start_time_str, '%Y-%m-%dT%H:%M:%SZ')
@@ -55,7 +60,7 @@ def silence(self, alert: PrometheusKubernetesAlert) -> bool:
class Enricher:
params: Dict[Any, Any] = None

def __init__(self, params: Dict[Any,Any]):
def __init__(self, params: Dict[Any, Any]):
self.params = params

def enrich(self, alert: PrometheusKubernetesAlert):
@@ -111,7 +116,11 @@ def enrich(self, alert: PrometheusKubernetesAlert):
class NodeCPUEnricher (Enricher):

def enrich(self, alert: PrometheusKubernetesAlert):
alert.report_blocks.extend(do_node_cpu_analysis(alert.obj.metadata.name))
if not alert.node:
logging.error(f"NodeCPUEnricher was called on alert without node metadata: {alert.alert}")
return

alert.report_blocks.extend(do_node_cpu_analysis(alert.node))
alert.report_title = f"{alert.alert.labels.get('alertname')} Node CPU Analysis"


@@ -143,6 +152,15 @@ def enrich(self, alert: PrometheusKubernetesAlert):
{"search_term": alert_name}))


class OOMKillerEnricher (Enricher):

def enrich(self, alert: PrometheusKubernetesAlert):
if not alert.node:
logging.error(f"cannot run OOMKillerEnricher on alert with no node object: {alert}")
return
alert.report_blocks.extend(do_show_recent_oom_kills(alert.node))


DEFAULT_ENRICHER = "AlertDefaults"

silencers = {}
@@ -153,6 +171,7 @@ def enrich(self, alert: PrometheusKubernetesAlert):
enrichers["GraphEnricher"] = GraphEnricher
enrichers["StackOverflowEnricher"] = StackOverflowEnricher
enrichers["NodeCPUAnalysis"] = NodeCPUEnricher
enrichers["OOMKillerEnricher"] = OOMKillerEnricher


class AlertConfig(BaseModel):
@@ -177,12 +196,11 @@ def alerts_integration(alert: PrometheusKubernetesAlert, config: AlertsIntegrati
alert_name = alert.alert.labels.get("alertname")

# filter out the dummy watchdog alert that prometheus constantly sends so that you know it is alive
if alert_name == "Watchdog" and alert.alert.labels.get("severity") == "none":
if alert_name == "Watchdog" and alert.alert_severity == "none":
logging.debug(f"skipping watchdog alert {alert}")
return

logging.info(
f'running alerts_integration alert - alert: {alert.alert} pod: {alert.obj.metadata.name if alert.obj is not None else "None!"}')
logging.info(f'running alerts_integration alert - alert: {alert.alert}')

# TODO: should we really handle this as a list as opposed to looking for the first one that matches?
alert_configs = [alert_config for alert_config in config.alerts_config if alert_config.alert_name == alert_name]
@@ -206,4 +224,6 @@ def alerts_integration(alert: PrometheusKubernetesAlert, config: AlertsIntegrati
enricher_class(enricher_config.params).enrich(alert)

if alert.report_blocks or alert.report_title or alert.report_attachment_blocks:
if not alert.report_title:
alert.report_title = alert_name
send_to_slack(alert)
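Every enricher in this file follows the same pattern: subclass Enricher, implement enrich() to append blocks to alert.report_blocks, and register the class by name in the enrichers dict. A minimal sketch of an additional enricher written against that pattern (the class name and message text are hypothetical, not part of this PR):

class MyCustomEnricher(Enricher):

    def enrich(self, alert: PrometheusKubernetesAlert):
        # ListBlock is used the same way by OOMKillerEnricher above
        alert.report_blocks.append(ListBlock([f"custom note for {alert.alert.labels.get('alertname')}"]))

enrichers["MyCustomEnricher"] = MyCustomEnricher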
10 changes: 6 additions & 4 deletions playbooks/node_cpu_analysis.py
@@ -10,7 +10,7 @@ class NodeCPUAnalysisParams(BaseModel):
slack_channel: str = ""


def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBlock]:
def do_node_cpu_analysis(node: Node, prometheus_url: str = None) -> List[BaseBlock]:
analyzer = NodeAnalyzer(node, prometheus_url)

threshold = 0.005
@@ -23,7 +23,7 @@ def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBloc
all_pod_names = list(set(per_pod_usage_unbounded.keys()).union(per_pod_request.keys()))

treemap = pygal.Treemap(style=ChosenStyle)
treemap.title = f'CPU Usage on Node {node}'
treemap.title = f'CPU Usage on Node {node.metadata.name}'
treemap.value_formatter = lambda x: f"{int(x * 100)}%"
treemap.add("Non-container usage", [non_container_cpu_usage])
treemap.add("Free CPU", [1 - total_cpu_usage])
@@ -32,7 +32,7 @@ def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBloc

MISSING_VALUE = -0.001
bar_chart = pygal.Bar(x_label_rotation=-40, style=ChosenStyle)
bar_chart.title = f'Actual Vs Requested vCPUs on Node {node}'
bar_chart.title = f'Actual Vs Requested vCPUs on Node {node.metadata.name}'
bar_chart.x_labels = all_pod_names
bar_chart.value_formatter = lambda x: f"{x:.2f} vCPU" if x != MISSING_VALUE else "no data"
bar_chart.add('Actual CPU Usage',
@@ -61,7 +61,9 @@ def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBloc
@on_manual_trigger
def node_cpu_analysis(event: ManualTriggerEvent):
params = NodeCPUAnalysisParams(**event.data)
node = Node().read(name=params.node)

event.report_title = f"Node CPU Usage Report for {params.node}"
event.slack_channel = params.slack_channel
event.report_blocks = do_node_cpu_analysis(params.node, params.prometheus_url)
event.report_blocks = do_node_cpu_analysis(node, params.prometheus_url)
send_to_slack(event)
57 changes: 57 additions & 0 deletions playbooks/oom_killer.py
@@ -0,0 +1,57 @@
from robusta.api import *
from robusta.integrations.kubernetes.api_client_utils import parse_kubernetes_datetime
from datetime import datetime, timezone
from collections import namedtuple
import humanize


class OOMKillerParams (BaseModel):
node_name: str = None
slack_channel: str


def is_oom_status(status: ContainerStatus):
if not status.lastState:
return False
if not status.lastState.terminated:
return False
return status.lastState.terminated.reason == "OOMKilled"


OOMKill = namedtuple('OOMKill', ['datetime', 'message'])


def do_show_recent_oom_kills(node: Node) -> List[BaseBlock]:
results: PodList = Pod.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj

oom_kills: List[OOMKill] = []
for pod in results.items:
oom_statuses = filter(is_oom_status, pod.status.containerStatuses)
for status in oom_statuses:
dt = parse_kubernetes_datetime(status.lastState.terminated.finishedAt)
time_ago = humanize.naturaltime(datetime.now(timezone.utc)-dt)
msg = f"*{time_ago}*: pod={pod.metadata.name}; container={status.name}; image={status.image}"
oom_kills.append(OOMKill(dt, msg))

oom_kills.sort(key=lambda o: o.datetime)

if oom_kills:
logging.info(f"found at least one oom killer on {node.metadata.name}")
return [ListBlock([oom.message for oom in oom_kills])]
else:
logging.info(f"found no oom killers on {node.metadata.name}")
return []


@on_manual_trigger
def show_recent_oom_kills(event: ManualTriggerEvent):
params = OOMKillerParams(**event.data)
node = Node().read(name=params.node_name)
blocks = do_show_recent_oom_kills(node)
if blocks:
event.report_blocks.extend(blocks)
event.slack_channel = params.slack_channel
event.report_title = f"Latest OOM Kills on {params.node_name}"
send_to_slack(event)
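do_show_recent_oom_kills is kept separate from the trigger so other playbooks can reuse it: OOMKillerEnricher above calls it with alert.node, and a manual run only needs a Node object. A small usage sketch (the node name is hypothetical):

node = Node().read(name="worker-1")
blocks = do_show_recent_oom_kills(node)  # List[BaseBlock]; empty if nothing was OOMKilled on the node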


3 changes: 2 additions & 1 deletion playbooks/requirements.txt
@@ -4,4 +4,5 @@ kubernetes
prometheus-api-client
pygal
tinycss
cssselect
cssselect
humanize
12 changes: 6 additions & 6 deletions playbooks/simple_examples.py
@@ -7,28 +7,28 @@ class HighCpuConfig(BaseModel):

@on_report_callback
def high_cpu_delete_confirmation_handler(event: ReportCallbackEvent):
logging.info(f'high_cpu_delete_confirmation_handler {event.context}')
logging.info(f'high_cpu_delete_confirmation_handler {event.source_context}')


@on_report_callback
def high_cpu_profile_confirmation_handler(event: ReportCallbackEvent):
logging.info(f'high_cpu_profile_confirmation_handler {event.context}')
logging.info(f'high_cpu_profile_confirmation_handler {event.source_context}')


@on_pod_prometheus_alert(alert_name="HighCPUAlert", status="firing")
def slack_confirmation_on_cpu(event: PrometheusKubernetesAlert, config: HighCpuConfig):
logging.info(f'running slack_confirmation_on_cpu alert - alert: {event.alert} pod: {event.obj}')
logging.info(f'running slack_confirmation_on_cpu alert - alert: {event.alert} pod: {event.pod}')

choices = {
'delete pod': high_cpu_delete_confirmation_handler,
'profile pod': high_cpu_profile_confirmation_handler
}
context = {
'pod_name': event.obj.metadata.name,
'namespace': event.obj.metadata.namespace
'pod_name': event.pod.metadata.name,
'namespace': event.pod.metadata.namespace
}

event.report_title = f"Pod {event.obj.metadata.name} has high cpu"
event.report_title = f"Pod {event.pod.metadata.name} has high cpu"
event.slack_channel = config.slack_channel
event.report_blocks.extend([
CallbackBlock(choices, context)
4 changes: 4 additions & 0 deletions src/robusta/integrations/kubernetes/api_client_utils.py
@@ -1,3 +1,4 @@
import datetime
import logging
import os
import re
@@ -157,3 +158,6 @@ def to_kubernetes_name(name, prefix=""):
unique_id = str(time.time()).replace('.', '-')
safe_name = re.sub("[^0-9a-zA-Z\\-]+", "-", name)
return f"{prefix}{safe_name}-{unique_id}"[:63]

def parse_kubernetes_datetime(k8s_datetime: str) -> datetime.datetime:
return datetime.datetime.strptime(k8s_datetime, "%Y-%m-%dT%H:%M:%S%z")
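A quick usage sketch of the new helper (assuming Python 3.7+, where %z also accepts the literal "Z" suffix that Kubernetes timestamps use):

ts = parse_kubernetes_datetime("2021-06-29T10:15:00Z")    # hypothetical timestamp; returns an aware datetime
age = datetime.datetime.now(datetime.timezone.utc) - ts   # subtraction works because both sides are timezone-aware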
1 change: 1 addition & 0 deletions src/robusta/integrations/kubernetes/base_triggers.py
@@ -2,6 +2,7 @@
from functools import wraps

import hikaru
from hikaru.model import *
from ...core.model.cloud_event import *
from ...core.model.playbook_hash import playbook_hash
from ...core.model.runner_config import *
2 changes: 1 addition & 1 deletion src/robusta/integrations/kubernetes/custom_models.py
@@ -4,7 +4,7 @@
import hikaru
import json
import yaml
from hikaru.model import Pod, PodList, Container, PodSpec, ObjectMeta, SecurityContext, Capabilities, Deployment, JobSpec, PodTemplateSpec
from hikaru.model import * # *-import is necessary for hikaru subclasses to work
from pydantic import BaseModel

from .api_client_utils import *
11 changes: 9 additions & 2 deletions src/robusta/integrations/prometheus/models.py
@@ -6,7 +6,7 @@
from hikaru.model import Node

from ...core.model.events import BaseEvent
from ..kubernetes.custom_models import RobustaPod, RobustaDeployment
from ..kubernetes.custom_models import RobustaPod, RobustaDeployment, RobustaJob


# for parsing incoming data
Expand Down Expand Up @@ -34,7 +34,14 @@ class PrometheusEvent(BaseModel):
status: str


# everything here needs to be optional due to annoying subtleties regarding dataclass inheritance
# see explanation in the code for BaseEvent
@dataclass
class PrometheusKubernetesAlert (BaseEvent):
alert: Optional[PrometheusAlert] = None
obj: Union[RobustaPod, Node, RobustaDeployment, None] = None
alert_name: Optional[str] = None
alert_severity: Optional[str] = None
node: Optional[Node] = None
pod: Optional[RobustaPod] = None
deployment: Optional[RobustaDeployment] = None
job: Optional[RobustaJob] = None
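The "everything optional" comment above refers to a dataclass constraint: once a base dataclass declares a field with a default value, any subclass field without a default makes the generated __init__ invalid. A standalone sketch of the rule (BaseEvent here is a stand-in, not the real robusta class):

from dataclasses import dataclass
from typing import Optional

@dataclass
class BaseEvent:
    report_title: str = ""  # stand-in: assume the real base class already has defaulted fields

# Adding a field without a default here would raise at class-creation time:
#   TypeError: non-default argument 'alert_name' follows default argument
@dataclass
class ExampleAlert(BaseEvent):
    alert_name: Optional[str] = None  # hence every field on PrometheusKubernetesAlert gets a default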