Alerts integration improvements and oom killer #9

Merged 6 commits on Jun 29, 2021
2 changes: 2 additions & 0 deletions docs/user-guide/alerts.rst
@@ -146,6 +146,8 @@ Available enrichers

**NodeCPUAnalysis:** provide deep analysis of node cpu usage

**OOMKillerEnricher:** shows which pods were recently OOM Killed on a node

**GraphEnricher:** display a graph of the Prometheus query which triggered the alert

**StackOverflowEnricher:** add a button in Slack to search for the alert name on StackOverflow
40 changes: 30 additions & 10 deletions playbooks/alerts_integration.py
@@ -9,11 +9,14 @@

from robusta.api import *
from node_cpu_analysis import do_node_cpu_analysis
from oom_killer import do_show_recent_oom_kills


class GenParams(BaseModel):
name: str
params: Dict[Any,Any] = None


class Silencer:
params: Dict[Any,Any]

@@ -34,18 +37,20 @@ def __init__(self, params: Dict[Any, Any]):
self.post_restart_silence = self.params.get("post_restart_silence")

def silence(self, alert: PrometheusKubernetesAlert) -> bool:
if not alert.obj or not alert.obj.kind == "Pod":
if not alert.pod:
return False # Silencing only pod alerts on NodeRestartSilencer

node: Node = Node.readNode(alert.obj.spec.nodeName).obj
# TODO: do we already have alert.Node here?
node: Node = Node.readNode(alert.pod.spec.nodeName).obj
if not node:
logging.warning(f"Node {alert.obj.spec.nodeName} not found for NodeRestartSilencer for {alert}")
logging.warning(f"Node {alert.pod.spec.nodeName} not found for NodeRestartSilencer for {alert}")
return False

last_transition_times = [condition.lastTransitionTime for condition in node.status.conditions if condition.type == "Ready"]
last_transition_times = [condition.lastTransitionTime for condition in node.status.conditions
if condition.type == "Ready"]
if last_transition_times and last_transition_times[0]:
node_start_time_str = last_transition_times[0]
else: # if no ready time, take creation time
else: # if no ready time, take creation time
node_start_time_str = node.metadata.creationTimestamp

node_start_time = datetime.strptime(node_start_time_str, '%Y-%m-%dT%H:%M:%SZ')
@@ -55,7 +60,7 @@ def silence(self, alert: PrometheusKubernetesAlert) -> bool:
class Enricher:
params: Dict[Any, Any] = None

def __init__(self, params: Dict[Any,Any]):
def __init__(self, params: Dict[Any, Any]):
self.params = params

def enrich(self, alert: PrometheusKubernetesAlert):
@@ -111,7 +116,11 @@ def enrich(self, alert: PrometheusKubernetesAlert):
class NodeCPUEnricher (Enricher):

def enrich(self, alert: PrometheusKubernetesAlert):
alert.report_blocks.extend(do_node_cpu_analysis(alert.obj.metadata.name))
if not alert.node:
logging.error(f"NodeCPUEnricher was called on alert without node metadata: {alert.alert}")
return

alert.report_blocks.extend(do_node_cpu_analysis(alert.node))
alert.report_title = f"{alert.alert.labels.get('alertname')} Node CPU Analysis"


@@ -143,6 +152,15 @@ def enrich(self, alert: PrometheusKubernetesAlert):
{"search_term": alert_name}))


class OOMKillerEnricher (Enricher):

def enrich(self, alert: PrometheusKubernetesAlert):
if not alert.node:
logging.error(f"cannot run OOMKillerEnricher on alert with no node object: {alert}")
return
alert.report_blocks.extend(do_show_recent_oom_kills(alert.node))


DEFAULT_ENRICHER = "AlertDefaults"

silencers = {}
@@ -153,6 +171,7 @@ def enrich(self, alert: PrometheusKubernetesAlert):
enrichers["GraphEnricher"] = GraphEnricher
enrichers["StackOverflowEnricher"] = StackOverflowEnricher
enrichers["NodeCPUAnalysis"] = NodeCPUEnricher
enrichers["OOMKillerEnricher"] = OOMKillerEnricher


class AlertConfig(BaseModel):
@@ -177,12 +196,11 @@ def alerts_integration(alert: PrometheusKubernetesAlert, config: AlertsIntegrati
alert_name = alert.alert.labels.get("alertname")

# filter out the dummy watchdog alert that prometheus constantly sends so that you know it is alive
if alert_name == "Watchdog" and alert.alert.labels.get("severity") == "none":
if alert_name == "Watchdog" and alert.alert_severity == "none":
logging.debug(f"skipping watchdog alert {alert}")
return

logging.info(
f'running alerts_integration alert - alert: {alert.alert} pod: {alert.obj.metadata.name if alert.obj is not None else "None!"}')
logging.info(f'running alerts_integration alert - alert: {alert.alert}')

# TODO: should we really handle this as a list as opposed to looking for the first one that matches?
alert_configs = [alert_config for alert_config in config.alerts_config if alert_config.alert_name == alert_name]
@@ -206,4 +224,6 @@ def alerts_integration(alert: PrometheusKubernetesAlert, config: AlertsIntegrati
enricher_class(enricher_config.params).enrich(alert)

if alert.report_blocks or alert.report_title or alert.report_attachment_blocks:
if not alert.report_title:
alert.report_title = alert_name
send_to_slack(alert)
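Every enricher in this file follows the same pattern: subclass Enricher, implement enrich() to append blocks to alert.report_blocks, and register the class by name in the enrichers dict. A minimal sketch of an additional enricher written against that pattern (the class name and message text are hypothetical, not part of this PR):

class MyCustomEnricher(Enricher):

    def enrich(self, alert: PrometheusKubernetesAlert):
        # ListBlock is used the same way by OOMKillerEnricher above
        alert.report_blocks.append(ListBlock([f"custom note for {alert.alert.labels.get('alertname')}"]))

enrichers["MyCustomEnricher"] = MyCustomEnricher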
10 changes: 6 additions & 4 deletions playbooks/node_cpu_analysis.py
@@ -10,7 +10,7 @@ class NodeCPUAnalysisParams(BaseModel):
slack_channel: str = ""


def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBlock]:
def do_node_cpu_analysis(node: Node, prometheus_url: str = None) -> List[BaseBlock]:
analyzer = NodeAnalyzer(node, prometheus_url)

threshold = 0.005
@@ -23,7 +23,7 @@ def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBloc
all_pod_names = list(set(per_pod_usage_unbounded.keys()).union(per_pod_request.keys()))

treemap = pygal.Treemap(style=ChosenStyle)
treemap.title = f'CPU Usage on Node {node}'
treemap.title = f'CPU Usage on Node {node.metadata.name}'
treemap.value_formatter = lambda x: f"{int(x * 100)}%"
treemap.add("Non-container usage", [non_container_cpu_usage])
treemap.add("Free CPU", [1 - total_cpu_usage])
@@ -32,7 +32,7 @@ def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBloc

MISSING_VALUE = -0.001
bar_chart = pygal.Bar(x_label_rotation=-40, style=ChosenStyle)
bar_chart.title = f'Actual Vs Requested vCPUs on Node {node}'
bar_chart.title = f'Actual Vs Requested vCPUs on Node {node.metadata.name}'
bar_chart.x_labels = all_pod_names
bar_chart.value_formatter = lambda x: f"{x:.2f} vCPU" if x != MISSING_VALUE else "no data"
bar_chart.add('Actual CPU Usage',
@@ -61,7 +61,9 @@ def do_node_cpu_analysis(node: str, prometheus_url: str = None) -> List[BaseBloc
@on_manual_trigger
def node_cpu_analysis(event: ManualTriggerEvent):
params = NodeCPUAnalysisParams(**event.data)
node = Node().read(name=params.node)

event.report_title = f"Node CPU Usage Report for {params.node}"
event.slack_channel = params.slack_channel
event.report_blocks = do_node_cpu_analysis(params.node, params.prometheus_url)
event.report_blocks = do_node_cpu_analysis(node, params.prometheus_url)
send_to_slack(event)
57 changes: 57 additions & 0 deletions playbooks/oom_killer.py
@@ -0,0 +1,57 @@
from robusta.api import *
from robusta.integrations.kubernetes.api_client_utils import parse_kubernetes_datetime
from datetime import datetime, timezone
from collections import namedtuple
import humanize


class OOMKillerParams (BaseModel):
node_name: str = None
slack_channel: str


def is_oom_status(status: ContainerStatus):
if not status.lastState:
return False
if not status.lastState.terminated:
return False
return status.lastState.terminated.reason == "OOMKilled"


OOMKill = namedtuple('OOMKill', ['datetime', 'message'])


def do_show_recent_oom_kills(node: Node) -> List[BaseBlock]:
results: PodList = Pod.listPodForAllNamespaces(field_selector=f"spec.nodeName={node.metadata.name}").obj

oom_kills: List[OOMKill] = []
for pod in results.items:
oom_statuses = filter(is_oom_status, pod.status.containerStatuses)
for status in oom_statuses:
dt = parse_kubernetes_datetime(status.lastState.terminated.finishedAt)
time_ago = humanize.naturaltime(datetime.now(timezone.utc)-dt)
msg = f"*{time_ago}*: pod={pod.metadata.name}; container={status.name}; image={status.image}"
oom_kills.append(OOMKill(dt, msg))

oom_kills.sort(key=lambda o: o.datetime)

if oom_kills:
logging.info(f"found at least one oom killer on {node.metadata.name}")
return [ListBlock([oom.message for oom in oom_kills])]
else:
logging.info(f"found no oom killers on {node.metadata.name}")
return []


@on_manual_trigger
def show_recent_oom_kills(event: ManualTriggerEvent):
params = OOMKillerParams(**event.data)
node = Node().read(name=params.node_name)
blocks = do_show_recent_oom_kills(node)
if blocks:
event.report_blocks.extend(blocks)
event.slack_channel = params.slack_channel
event.report_title = f"Latest OOM Kills on {params.node_name}"
send_to_slack(event)
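do_show_recent_oom_kills is kept separate from the trigger so other playbooks can reuse it: OOMKillerEnricher above calls it with alert.node, and a manual run only needs a Node object. A small usage sketch (the node name is hypothetical):

node = Node().read(name="worker-1")
blocks = do_show_recent_oom_kills(node)  # List[BaseBlock]; empty if nothing was OOMKilled on the node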


3 changes: 2 additions & 1 deletion playbooks/requirements.txt
@@ -4,4 +4,5 @@ kubernetes
prometheus-api-client
pygal
tinycss
cssselect
cssselect
humanize
12 changes: 6 additions & 6 deletions playbooks/simple_examples.py
@@ -7,28 +7,28 @@ class HighCpuConfig(BaseModel):

@on_report_callback
def high_cpu_delete_confirmation_handler(event: ReportCallbackEvent):
logging.info(f'high_cpu_delete_confirmation_handler {event.context}')
logging.info(f'high_cpu_delete_confirmation_handler {event.source_context}')


@on_report_callback
def high_cpu_profile_confirmation_handler(event: ReportCallbackEvent):
logging.info(f'high_cpu_profile_confirmation_handler {event.context}')
logging.info(f'high_cpu_profile_confirmation_handler {event.source_context}')


@on_pod_prometheus_alert(alert_name="HighCPUAlert", status="firing")
def slack_confirmation_on_cpu(event: PrometheusKubernetesAlert, config: HighCpuConfig):
logging.info(f'running slack_confirmation_on_cpu alert - alert: {event.alert} pod: {event.obj}')
logging.info(f'running slack_confirmation_on_cpu alert - alert: {event.alert} pod: {event.pod}')

choices = {
'delete pod': high_cpu_delete_confirmation_handler,
'profile pod': high_cpu_profile_confirmation_handler
}
context = {
'pod_name': event.obj.metadata.name,
'namespace': event.obj.metadata.namespace
'pod_name': event.pod.metadata.name,
'namespace': event.pod.metadata.namespace
}

event.report_title = f"Pod {event.obj.metadata.name} has high cpu"
event.report_title = f"Pod {event.pod.metadata.name} has high cpu"
event.slack_channel = config.slack_channel
event.report_blocks.extend([
CallbackBlock(choices, context)
4 changes: 4 additions & 0 deletions src/robusta/integrations/kubernetes/api_client_utils.py
@@ -1,3 +1,4 @@
import datetime
import logging
import os
import re
@@ -157,3 +158,6 @@ def to_kubernetes_name(name, prefix=""):
unique_id = str(time.time()).replace('.', '-')
safe_name = re.sub("[^0-9a-zA-Z\\-]+", "-", name)
return f"{prefix}{safe_name}-{unique_id}"[:63]

def parse_kubernetes_datetime(k8s_datetime: str) -> datetime.datetime:
return datetime.datetime.strptime(k8s_datetime, "%Y-%m-%dT%H:%M:%S%z")
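A quick usage sketch of the new helper (assuming Python 3.7+, where %z also accepts the literal "Z" suffix that Kubernetes timestamps use):

ts = parse_kubernetes_datetime("2021-06-29T10:15:00Z")    # hypothetical timestamp; returns an aware datetime
age = datetime.datetime.now(datetime.timezone.utc) - ts   # subtraction works because both sides are timezone-aware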
1 change: 1 addition & 0 deletions src/robusta/integrations/kubernetes/base_triggers.py
@@ -2,6 +2,7 @@
from functools import wraps

import hikaru
from hikaru.model import *
from ...core.model.cloud_event import *
from ...core.model.playbook_hash import playbook_hash
from ...core.model.runner_config import *
2 changes: 1 addition & 1 deletion src/robusta/integrations/kubernetes/custom_models.py
@@ -4,7 +4,7 @@
import hikaru
import json
import yaml
from hikaru.model import Pod, PodList, Container, PodSpec, ObjectMeta, SecurityContext, Capabilities, Deployment, JobSpec, PodTemplateSpec
from hikaru.model import * # *-import is necessary for hikaru subclasses to work
from pydantic import BaseModel

from .api_client_utils import *
11 changes: 9 additions & 2 deletions src/robusta/integrations/prometheus/models.py
@@ -6,7 +6,7 @@
from hikaru.model import Node

from ...core.model.events import BaseEvent
from ..kubernetes.custom_models import RobustaPod, RobustaDeployment
from ..kubernetes.custom_models import RobustaPod, RobustaDeployment, RobustaJob


# for parsing incoming data
Expand Down Expand Up @@ -34,7 +34,14 @@ class PrometheusEvent(BaseModel):
status: str


# everything here needs to be optional due to annoying subtleties regarding dataclass inheritance
# see explanation in the code for BaseEvent
@dataclass
class PrometheusKubernetesAlert (BaseEvent):
alert: Optional[PrometheusAlert] = None
obj: Union[RobustaPod, Node, RobustaDeployment, None] = None
alert_name: Optional[str] = None
alert_severity: Optional[str] = None
node: Optional[Node] = None
pod: Optional[RobustaPod] = None
deployment: Optional[RobustaDeployment] = None
job: Optional[RobustaJob] = None
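The "everything optional" comment above refers to a dataclass constraint: once a base dataclass declares a field with a default value, any subclass field without a default makes the generated __init__ invalid. A standalone sketch of the rule (BaseEvent here is a stand-in, not the real robusta class):

from dataclasses import dataclass
from typing import Optional

@dataclass
class BaseEvent:
    report_title: str = ""  # stand-in: assume the real base class already has defaulted fields

# Adding a field without a default here would raise at class-creation time:
#   TypeError: non-default argument 'alert_name' follows default argument
@dataclass
class ExampleAlert(BaseEvent):
    alert_name: Optional[str] = None  # hence every field on PrometheusKubernetesAlert gets a default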