From f01e1b22b85d0c99eb06fee1d602383c72e7413f Mon Sep 17 00:00:00 2001
From: Matt Whittington
Date: Tue, 3 Jul 2018 15:40:37 +1000
Subject: [PATCH] Rework pod.status_phase check

Now submits an agent-local count tagged with each of the five pod phases
and with the namespaces that contain pods. This should allow tracking
aggregates of these metrics cluster-wide while retaining performance.

Updated tests and the metric description to match.
---
 .../kubernetes_state/kubernetes_state.py |  18 +-
 kubernetes_state/metadata.csv            |   2 +-
 .../test/test_kubernetes_state.py        | 252 ------------------
 .../tests/test_kubernetes_state.py       |  19 ++
 4 files changed, 33 insertions(+), 258 deletions(-)
 delete mode 100644 kubernetes_state/test/test_kubernetes_state.py

diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py
index 3f478743460c9f..326b65f52dce97 100644
--- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py
+++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py
@@ -340,7 +340,8 @@ def _trim_job_tag(self, name):
     # Labels attached: namespace, pod
     # As a message the phase=Pending|Running|Succeeded|Failed|Unknown
     # From the phase the check will update its status
-    # Also submits as a gauge metric with tags so it is visualisable over time
+    # Also submits as an aggregated count with minimal tags so it is
+    # visualisable over time per namespace and phase
     def kube_pod_status_phase(self, message, **kwargs):
         """ Phase a pod is in. """
         metric_name = self.NAMESPACE + '.pod.status_phase'
@@ -348,10 +349,17 @@ def kube_pod_status_phase(self, message, **kwargs):
         # More details about the phase in the message of the check.
         check_basename = self.NAMESPACE + '.pod.phase'
         for metric in message.metric:
-            tags = [self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label),
-                    self._label_to_tag("phase", metric.label)] + self.custom_tags
-            self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=tags)
-            self.gauge(metric_name, metric.gauge.value, tags)
+            self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status,
+                                         tags=[self._label_to_tag("pod", metric.label),
+                                               self._label_to_tag("namespace", metric.label)] + self.custom_tags)
+
+            # Counts aggregated cluster-wide to avoid no-data issues on pod churn,
+            # pod granularity available in the service checks
+            tags = [
+                self._label_to_tag("namespace", metric.label),
+                self._label_to_tag("phase", metric.label)
+            ] + self.custom_tags
+            self.count(metric_name, metric.gauge.value, tags)
 
     def kube_pod_container_status_waiting_reason(self, message, **kwargs):
         metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv
index b7112b1da2cde8..d2cc9e34c89a05 100644
--- a/kubernetes_state/metadata.csv
+++ b/kubernetes_state/metadata.csv
@@ -52,7 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b
 kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
 kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready
 kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
-kubernetes_state.pod.status_phase,gauge,,,,Submitted with a value of 1 for each pod and tagged with 'phase'; Count this metric by a distinct phase to get the number of pods in that phase,0,kubernetes,k8s_state.pod.status_phase
+kubernetes_state.pod.status_phase,gauge,,,,"Sum by `phase` to get the number of pods in a given phase, and by `namespace` to break this down per namespace",0,kubernetes,k8s_state.pod.status_phase
 kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
 kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
 kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
diff --git a/kubernetes_state/test/test_kubernetes_state.py b/kubernetes_state/test/test_kubernetes_state.py
deleted file mode 100644
index 60e20d965d3e92..00000000000000
--- a/kubernetes_state/test/test_kubernetes_state.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# (C) Datadog, Inc. 2016
-# All rights reserved
-# Licensed under Simplified BSD License (see LICENSE)
-
-# stdlib
-import mock
-import os
-
-# 3p
-from nose.plugins.attrib import attr
-
-# project
-from tests.checks.common import AgentCheckTest
-
-NAMESPACE = 'kubernetes_state'
-
-class MockResponse:
-    """
-    MockResponse is used to simulate the object requests.Response commonly returned by requests.get
-    """
-
-    def __init__(self, content, content_type):
-        self.content = content
-        self.headers = {'Content-Type': content_type}
-
-    def iter_lines(self, **_):
-        for elt in self.content.split("\n"):
-            yield elt
-
-    def close(self):
-        pass
-
-
-@attr(requires='kubernetes_state')
-class TestKubernetesState(AgentCheckTest):
-    CHECK_NAME = 'kubernetes_state'
-
-    METRICS = [
-        # nodes
-        NAMESPACE + '.node.cpu_capacity',
-        NAMESPACE + '.node.memory_capacity',
-        NAMESPACE + '.node.pods_capacity',
-        NAMESPACE + '.node.cpu_allocatable',
-        NAMESPACE + '.node.memory_allocatable',
-        NAMESPACE + '.node.pods_allocatable',
-        NAMESPACE + '.node.gpu.cards_capacity',
-        NAMESPACE + '.node.gpu.cards_allocatable',
-        NAMESPACE + '.nodes.by_condition',
-        # deployments
-        NAMESPACE + '.deployment.replicas',
-        NAMESPACE + '.deployment.replicas_available',
-        NAMESPACE + '.deployment.replicas_unavailable',
-        NAMESPACE + '.deployment.replicas_updated',
-        NAMESPACE + '.deployment.replicas_desired',
-        NAMESPACE + '.deployment.paused',
-        NAMESPACE + '.deployment.rollingupdate.max_unavailable',
-        # daemonsets
-        NAMESPACE + '.daemonset.scheduled',
-        NAMESPACE + '.daemonset.misscheduled',
-        NAMESPACE + '.daemonset.desired',
-        # hpa
-        NAMESPACE + '.hpa.min_replicas',
-        NAMESPACE + '.hpa.max_replicas',
-        NAMESPACE + '.hpa.desired_replicas',
-        NAMESPACE + '.hpa.current_replicas',
-        # pods
-        NAMESPACE + '.pod.ready',
-        NAMESPACE + '.pod.scheduled',
-        NAMESPACE + '.pod.status_phase',
-        # containers
-        NAMESPACE + '.container.ready',
-        NAMESPACE + '.container.running',
-        NAMESPACE + '.container.terminated',
-        NAMESPACE + '.container.status_report.count.terminated',
-        NAMESPACE + '.container.waiting',
-        NAMESPACE + '.container.status_report.count.waiting',
-        NAMESPACE + '.container.restarts',
-        NAMESPACE + '.container.cpu_requested',
-        NAMESPACE + '.container.memory_requested',
-        NAMESPACE + '.container.cpu_limit',
-        NAMESPACE + '.container.memory_limit',
-        NAMESPACE + '.container.gpu.request',
-        NAMESPACE + '.container.gpu.limit',
-        # replicasets
-        NAMESPACE + '.replicaset.replicas',
-        NAMESPACE + '.replicaset.fully_labeled_replicas',
-        NAMESPACE + '.replicaset.replicas_ready',
-        NAMESPACE + '.replicaset.replicas_desired',
-        # persistentvolume claim
-        NAMESPACE + '.persistentvolumeclaim.status',
-        # statefulset
-        NAMESPACE + '.statefulset.replicas',
-        NAMESPACE + '.statefulset.replicas_current',
-        NAMESPACE + '.statefulset.replicas_ready',
-        NAMESPACE + '.statefulset.replicas_updated',
-    ]
-
-    TAGS = {
-        NAMESPACE + '.pod.ready': ['node:minikube'],
-        NAMESPACE + '.pod.scheduled': ['node:minikube'],
-        NAMESPACE + '.nodes.by_condition': [
-            'condition:MemoryPressure', 'condition:DiskPressure',
-            'condition:OutOfDisk', 'condition:Ready',
-            'status:true', 'status:false', 'status:unknown',
-        ]
-    }
-
-    JOINED_METRICS = {
-        NAMESPACE + '.deployment.replicas': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-        NAMESPACE + '.deployment.replicas_available': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-        NAMESPACE + '.deployment.replicas_unavailable': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-        NAMESPACE + '.deployment.replicas_updated': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-        NAMESPACE + '.deployment.replicas_desired': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-        NAMESPACE + '.deployment.paused': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-        NAMESPACE + '.deployment.rollingupdate.max_unavailable': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'],
-    }
-
-    HOSTNAMES = {
-        NAMESPACE + '.pod.ready': 'minikube',
-        NAMESPACE + '.pod.scheduled': 'minikube'
-    }
-
-    ZERO_METRICS = [
-        NAMESPACE + '.deployment.replicas_unavailable',
-        NAMESPACE + '.deployment.paused',
-        NAMESPACE + '.daemonset.misscheduled',
-        NAMESPACE + '.container.terminated',
-        NAMESPACE + '.container.waiting',
-    ]
-
-    def assertMetricNotAllZeros(self, metric_name):
-        for mname, ts, val, mdata in self.metrics:
-            if mname == metric_name:
-                if val != 0:
-                    return True
-        raise AssertionError("All metrics named %s have 0 value." % metric_name)
-
-    @mock.patch('checks.prometheus_check.PrometheusCheck.poll')
-    def test__update_kube_state_metrics(self, mock_poll):
-        f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt')
-        with open(f_name, 'rb') as f:
-            mock_poll.return_value = MockResponse(f.read(), 'text/plain')
-
-        config = {
-            'instances': [{
-                'host': 'foo',
-                'kube_state_url': 'http://foo',
-                'tags': ['optional:tag1']
-            }]
-        }
-
-        # run check twice to have pod/node mapping
-        self.run_check_twice(config)
-
-        self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK)
-        self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK)
-        self.assertServiceCheck(NAMESPACE + '.node.memory_pressure', self.check.OK)
-        self.assertServiceCheck(NAMESPACE + '.node.network_unavailable', self.check.OK)
-        self.assertServiceCheck(NAMESPACE + '.node.disk_pressure', self.check.OK)
-        self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK,
-                                tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1', 'phase:Running'])  # Running
-        self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.WARNING,
-                                tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1', 'phase:Pending'])  # Pending
-        self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK,
-                                tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1', 'phase:Succeeded'])  # Succeeded
-        self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.CRITICAL,
-                                tags=['namespace:default', 'pod:should-run-once', 'optional:tag1', 'phase:Failed'])  # Failed
-        self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN,
-                                tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1', 'phase:Unknown'])  # Unknown
-
-        # Make sure we send counts for all statuses to avoid no-data graphing issues
-        self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true', 'optional:tag1'], value=1)
-        self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:false', 'optional:tag1'], value=0)
-        self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0)
-
-        for metric in self.METRICS:
-            self.assertMetric(
-                metric,
-                hostname=self.HOSTNAMES.get(metric, None)
-            )
-            tags = self.TAGS.get(metric, None)
-            if tags:
-                for tag in tags:
-                    self.assertMetricTag(metric, tag)
-            if metric not in self.ZERO_METRICS:
-                self.assertMetricNotAllZeros(metric)
-
-        self.assert_resourcequota()
-
-    @mock.patch('checks.prometheus_check.PrometheusCheck.poll')
-    def test__update_kube_state_metrics_v040(self, mock_poll):
-        f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt')
-        with open(f_name, 'rb') as f:
-            mock_poll.return_value = MockResponse(f.read(), 'text/plain')
-
-        config = {
-            'instances': [{
-                'host': 'foo',
-                'kube_state_url': 'http://foo',
-            }]
-        }
-
-        # run check twice to have pod/node mapping
-        self.run_check_twice(config)
-
-        self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK)
-        self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK)
-
-        for metric in self.METRICS:
-            if not metric.startswith(NAMESPACE + '.hpa'):
-                self.assertMetric(metric)
-
-        self.assert_resourcequota()
-
-    @mock.patch('checks.prometheus_check.PrometheusCheck.poll')
-    def test__join_custom_labels(self, mock_poll):
-        f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt')
-        with open(f_name, 'rb') as f:
-            mock_poll.return_value = MockResponse(f.read(), 'text/plain')
-
-        config = {
-            'instances': [{
-                'host': 'foo',
-                'kube_state_url': 'http://foo',
-                'label_joins': {
-                    'kube_deployment_labels': {
-                        'label_to_match': 'deployment',
-                        'labels_to_get':['label_addonmanager_kubernetes_io_mode']
-                    }
-                },
-            }]
-        }
-        # run check twice to have the labels join mapping.
-        self.run_check_twice(config)
-        for metric in self.METRICS:
-            self.assertMetric(
-                metric,
-                hostname=self.HOSTNAMES.get(metric, None)
-            )
-            tags = self.JOINED_METRICS.get(metric, None)
-            if tags:
-                for tag in tags:
-                    self.assertMetricTag(metric, tag)
-            if metric not in self.ZERO_METRICS:
-                self.assertMetricNotAllZeros(metric)
-
-    def assert_resourcequota(self):
-        """ The metric name is created dynamically so we just check some exist. """
-        for m in self.metrics:
-            if 'kubernetes_state.resourcequota.' in m[0]:
-                return True
-        return False
diff --git a/kubernetes_state/tests/test_kubernetes_state.py b/kubernetes_state/tests/test_kubernetes_state.py
index b13c4f461cad31..eb71ed4668e79b 100644
--- a/kubernetes_state/tests/test_kubernetes_state.py
+++ b/kubernetes_state/tests/test_kubernetes_state.py
@@ -46,6 +46,7 @@
     # pods
     NAMESPACE + '.pod.ready',
     NAMESPACE + '.pod.scheduled',
+    NAMESPACE + '.pod.status_phase',
     # containers
     NAMESPACE + '.container.ready',
     NAMESPACE + '.container.running',
@@ -81,6 +82,12 @@
         'condition:MemoryPressure', 'condition:DiskPressure',
         'condition:OutOfDisk', 'condition:Ready',
         'status:true', 'status:false', 'status:unknown',
+    ],
+    NAMESPACE + '.pod.status_phase': [
+        'phase:Pending', 'phase:Running',
+        'phase:Failed', 'phase:Succeeded',
+        'phase:Unknown', 'namespace:default',
+        'namespace:kube-system'
     ]
 }
 
@@ -212,6 +219,18 @@ def test_update_kube_state_metrics(aggregator, instance, check):
     aggregator.assert_metric(NAMESPACE + '.nodes.by_condition',
                              tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0)
 
+    # Make sure we send counts for all phases to avoid no-data graphing issues
+    aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
+                             tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=0)
+    aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
+                             tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=0)
+    aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
+                             tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=0)
+    aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
+                             tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=0)
+    aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
+                             tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=0)
+
     for metric in METRICS:
         aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None))
         for tag in TAGS.get(metric, []):
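
A minimal, runnable sketch (not part of the patch) of the aggregation the
reworked kube_pod_status_phase performs: one count per (namespace, phase)
pair instead of one gauge per pod, so pod churn no longer creates
short-lived per-pod timeseries. `aggregate_pod_phases` and the plain-dict
pods below are hypothetical stand-ins for the check's protobuf metric
messages; only the (namespace, phase) tagging scheme mirrors the patch.

from collections import Counter

def aggregate_pod_phases(pods, custom_tags=None):
    """Collapse per-pod phase samples into one count per (namespace, phase)."""
    custom_tags = custom_tags or []
    counts = Counter((pod['namespace'], pod['phase']) for pod in pods)
    # One timeseries per (namespace, phase) pair instead of one per pod.
    return [
        (['namespace:%s' % ns, 'phase:%s' % phase] + custom_tags, value)
        for (ns, phase), value in sorted(counts.items())
    ]

if __name__ == '__main__':
    pods = [
        {'namespace': 'default', 'phase': 'Running'},
        {'namespace': 'default', 'phase': 'Running'},
        {'namespace': 'kube-system', 'phase': 'Pending'},
    ]
    # Each tuple is (tags, value), i.e. roughly what
    # self.count(metric_name, value, tags) receives in the check above.
    for tags, value in aggregate_pod_phases(pods, custom_tags=['optional:tag1']):
        print(tags, value)

The design point: tag cardinality is now bounded by namespaces x 5 phases,
which is what lets the per-pod tag be dropped without losing cluster-wide
totals; per-pod detail remains available via the pod.phase service checks.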