From 3984042339b64b06f83579960cb3a7411dd6df4b Mon Sep 17 00:00:00 2001 From: Matt Whittington Date: Fri, 18 May 2018 18:42:48 +1000 Subject: [PATCH 1/5] Add kube_pod_status_phase as gauge It was previously only a status check which is not helpful if you want to visualize the status of your pods over time. --- .../kubernetes_state/kubernetes_state.py | 6 + kubernetes_state/metadata.csv | 1 + .../test/test_kubernetes_state.py | 252 ++++++++++++++++++ 3 files changed, 259 insertions(+) create mode 100644 kubernetes_state/test/test_kubernetes_state.py diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index 9a21ffb4e7032..7c285c4ed8ed8 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -342,8 +342,10 @@ def _trim_job_tag(self, name): # Labels attached: namespace, pod # As a message the phase=Pending|Running|Succeeded|Failed|Unknown # From the phase the check will update its status + # Also submits as a gauge metric with tags so it is visualisable over time def kube_pod_status_phase(self, message, **kwargs): """ Phase a pod is in. """ + metric_name = self.NAMESPACE + '.pod.status_phase' # Will submit a service check which status is given by its phase. # More details about the phase in the message of the check. check_basename = self.NAMESPACE + '.pod.phase' @@ -351,6 +353,10 @@ def kube_pod_status_phase(self, message, **kwargs): self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=[self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label)] + self.custom_tags) + # More verbose tagging for gauge vs service check + tags = [self._format_tag(label.name, label.value) for label in metric.label] + self.custom_tags + val = getattr(metric, METRIC_TYPES[message.type]).value + self.gauge(metric_name, val, tags) def kube_pod_container_status_waiting_reason(self, message, **kwargs): metric_name = self.NAMESPACE + '.container.status_report.count.waiting' diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index b7a7be51c605e..90210ce95e1a8 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -52,6 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled +kubernetes_state.pod.status_phase,gauge,,,,Count of current pods with their phase as a tag,0,kubernetes,k8s_state.pod.status_phase kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy diff --git a/kubernetes_state/test/test_kubernetes_state.py b/kubernetes_state/test/test_kubernetes_state.py new file mode 100644 index 0000000000000..05c945fcf8e68 --- /dev/null +++ b/kubernetes_state/test/test_kubernetes_state.py @@ -0,0 +1,252 @@ +# (C) Datadog, Inc. 2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +import mock +import os + +# 3p +from nose.plugins.attrib import attr + +# project +from tests.checks.common import AgentCheckTest + +NAMESPACE = 'kubernetes_state' + +class MockResponse: + """ + MockResponse is used to simulate the object requests.Response commonly returned by requests.get + """ + + def __init__(self, content, content_type): + self.content = content + self.headers = {'Content-Type': content_type} + + def iter_lines(self, **_): + for elt in self.content.split("\n"): + yield elt + + def close(self): + pass + + +@attr(requires='kubernetes_state') +class TestKubernetesState(AgentCheckTest): + CHECK_NAME = 'kubernetes_state' + + METRICS = [ + # nodes + NAMESPACE + '.node.cpu_capacity', + NAMESPACE + '.node.memory_capacity', + NAMESPACE + '.node.pods_capacity', + NAMESPACE + '.node.cpu_allocatable', + NAMESPACE + '.node.memory_allocatable', + NAMESPACE + '.node.pods_allocatable', + NAMESPACE + '.node.gpu.cards_capacity', + NAMESPACE + '.node.gpu.cards_allocatable', + NAMESPACE + '.nodes.by_condition', + # deployments + NAMESPACE + '.deployment.replicas', + NAMESPACE + '.deployment.replicas_available', + NAMESPACE + '.deployment.replicas_unavailable', + NAMESPACE + '.deployment.replicas_updated', + NAMESPACE + '.deployment.replicas_desired', + NAMESPACE + '.deployment.paused', + NAMESPACE + '.deployment.rollingupdate.max_unavailable', + # daemonsets + NAMESPACE + '.daemonset.scheduled', + NAMESPACE + '.daemonset.misscheduled', + NAMESPACE + '.daemonset.desired', + # hpa + NAMESPACE + '.hpa.min_replicas', + NAMESPACE + '.hpa.max_replicas', + NAMESPACE + '.hpa.desired_replicas', + NAMESPACE + '.hpa.current_replicas', + # pods + NAMESPACE + '.pod.ready', + NAMESPACE + '.pod.scheduled', + NAMESPACE + '.pod.status_phase', + # containers + NAMESPACE + '.container.ready', + NAMESPACE + '.container.running', + NAMESPACE + '.container.terminated', + NAMESPACE + '.container.status_report.count.terminated', + NAMESPACE + '.container.waiting', + NAMESPACE + '.container.status_report.count.waiting', + NAMESPACE + '.container.restarts', + NAMESPACE + '.container.cpu_requested', + NAMESPACE + '.container.memory_requested', + NAMESPACE + '.container.cpu_limit', + NAMESPACE + '.container.memory_limit', + NAMESPACE + '.container.gpu.request', + NAMESPACE + '.container.gpu.limit', + # replicasets + NAMESPACE + '.replicaset.replicas', + NAMESPACE + '.replicaset.fully_labeled_replicas', + NAMESPACE + '.replicaset.replicas_ready', + NAMESPACE + '.replicaset.replicas_desired', + # persistentvolume claim + NAMESPACE + '.persistentvolumeclaim.status', + # statefulset + NAMESPACE + '.statefulset.replicas', + NAMESPACE + '.statefulset.replicas_current', + NAMESPACE + '.statefulset.replicas_ready', + NAMESPACE + '.statefulset.replicas_updated', + ] + + TAGS = { + NAMESPACE + '.pod.ready': ['node:minikube'], + NAMESPACE + '.pod.scheduled': ['node:minikube'], + NAMESPACE + '.nodes.by_condition': [ + 'condition:MemoryPressure', 'condition:DiskPressure', + 'condition:OutOfDisk', 'condition:Ready', + 'status:true', 'status:false', 'status:unknown', + ] + } + + JOINED_METRICS = { + NAMESPACE + '.deployment.replicas': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + NAMESPACE + '.deployment.replicas_available': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + NAMESPACE + '.deployment.replicas_unavailable': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + NAMESPACE + '.deployment.replicas_updated': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + NAMESPACE + '.deployment.replicas_desired': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + NAMESPACE + '.deployment.paused': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + NAMESPACE + '.deployment.rollingupdate.max_unavailable': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], + } + + HOSTNAMES = { + NAMESPACE + '.pod.ready': 'minikube', + NAMESPACE + '.pod.scheduled': 'minikube' + } + + ZERO_METRICS = [ + NAMESPACE + '.deployment.replicas_unavailable', + NAMESPACE + '.deployment.paused', + NAMESPACE + '.daemonset.misscheduled', + NAMESPACE + '.container.terminated', + NAMESPACE + '.container.waiting', + ] + + def assertMetricNotAllZeros(self, metric_name): + for mname, ts, val, mdata in self.metrics: + if mname == metric_name: + if val != 0: + return True + raise AssertionError("All metrics named %s have 0 value." % metric_name) + + @mock.patch('checks.prometheus_check.PrometheusCheck.poll') + def test__update_kube_state_metrics(self, mock_poll): + f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt') + with open(f_name, 'rb') as f: + mock_poll.return_value = MockResponse(f.read(), 'text/plain') + + config = { + 'instances': [{ + 'host': 'foo', + 'kube_state_url': 'http://foo', + 'tags': ['optional:tag1'] + }] + } + + # run check twice to have pod/node mapping + self.run_check_twice(config) + + self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK) + self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK) + self.assertServiceCheck(NAMESPACE + '.node.memory_pressure', self.check.OK) + self.assertServiceCheck(NAMESPACE + '.node.network_unavailable', self.check.OK) + self.assertServiceCheck(NAMESPACE + '.node.disk_pressure', self.check.OK) + self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK, + tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1']) # Running + self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.WARNING, + tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1']) # Pending + self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK, + tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1']) # Succeeded + self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.CRITICAL, + tags=['namespace:default', 'pod:should-run-once', 'optional:tag1']) # Failed + self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN, + tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1']) # Unknown + + # Make sure we send counts for all statuses to avoid no-data graphing issues + self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true', 'optional:tag1'], value=1) + self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:false', 'optional:tag1'], value=0) + self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0) + + for metric in self.METRICS: + self.assertMetric( + metric, + hostname=self.HOSTNAMES.get(metric, None) + ) + tags = self.TAGS.get(metric, None) + if tags: + for tag in tags: + self.assertMetricTag(metric, tag) + if metric not in self.ZERO_METRICS: + self.assertMetricNotAllZeros(metric) + + self.assert_resourcequota() + + @mock.patch('checks.prometheus_check.PrometheusCheck.poll') + def test__update_kube_state_metrics_v040(self, mock_poll): + f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt') + with open(f_name, 'rb') as f: + mock_poll.return_value = MockResponse(f.read(), 'text/plain') + + config = { + 'instances': [{ + 'host': 'foo', + 'kube_state_url': 'http://foo', + }] + } + + # run check twice to have pod/node mapping + self.run_check_twice(config) + + self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK) + self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK) + + for metric in self.METRICS: + if not metric.startswith(NAMESPACE + '.hpa'): + self.assertMetric(metric) + + self.assert_resourcequota() + + @mock.patch('checks.prometheus_check.PrometheusCheck.poll') + def test__join_custom_labels(self, mock_poll): + f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt') + with open(f_name, 'rb') as f: + mock_poll.return_value = MockResponse(f.read(), 'text/plain') + + config = { + 'instances': [{ + 'host': 'foo', + 'kube_state_url': 'http://foo', + 'label_joins': { + 'kube_deployment_labels': { + 'label_to_match': 'deployment', + 'labels_to_get':['label_addonmanager_kubernetes_io_mode'] + } + }, + }] + } + # run check twice to have the labels join mapping. + self.run_check_twice(config) + for metric in self.METRICS: + self.assertMetric( + metric, + hostname=self.HOSTNAMES.get(metric, None) + ) + tags = self.JOINED_METRICS.get(metric, None) + if tags: + for tag in tags: + self.assertMetricTag(metric, tag) + if metric not in self.ZERO_METRICS: + self.assertMetricNotAllZeros(metric) + + def assert_resourcequota(self): + """ The metric name is created dynamically so we just check some exist. """ + for m in self.metrics: + if 'kubernetes_state.resourcequota.' in m[0]: + return True + return False From 9159e73d0542c844cae0f8e3c5852ef5d0e6f191 Mon Sep 17 00:00:00 2001 From: Matt Whittington Date: Tue, 22 May 2018 17:22:14 +1000 Subject: [PATCH 2/5] PR changes - tag cardinality, docs, code refactor Reduce the cardinality of the tags for the pod.status_phase metric to increase scalability. Rework the pod.status_phase metadata docstring to better represent what the metric is and how to use it. Refactored the code to make use of existing constructs. Adds "phase" tag to the service check portion of the pod_status_phase prometheus metric processing. Update tests to match. --- .../kubernetes_state/kubernetes_state.py | 11 ++++------- kubernetes_state/metadata.csv | 2 +- kubernetes_state/test/test_kubernetes_state.py | 10 +++++----- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index 7c285c4ed8ed8..d44e65e6c98d6 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -350,13 +350,10 @@ def kube_pod_status_phase(self, message, **kwargs): # More details about the phase in the message of the check. check_basename = self.NAMESPACE + '.pod.phase' for metric in message.metric: - self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, - tags=[self._label_to_tag("pod", metric.label), - self._label_to_tag("namespace", metric.label)] + self.custom_tags) - # More verbose tagging for gauge vs service check - tags = [self._format_tag(label.name, label.value) for label in metric.label] + self.custom_tags - val = getattr(metric, METRIC_TYPES[message.type]).value - self.gauge(metric_name, val, tags) + tags = [self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label), + self._label_to_tag("phase", metric.label)] + self.custom_tags + self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=tags) + self.gauge(metric_name, 1, tags) def kube_pod_container_status_waiting_reason(self, message, **kwargs): metric_name = self.NAMESPACE + '.container.status_report.count.waiting' diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index 90210ce95e1a8..b7112b1da2cde 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -52,7 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled -kubernetes_state.pod.status_phase,gauge,,,,Count of current pods with their phase as a tag,0,kubernetes,k8s_state.pod.status_phase +kubernetes_state.pod.status_phase,gauge,,,,Submitted with a value of 1 for each pod and tagged with 'phase'; Count this metric by a distinct phase to get the number of pods in that phase,0,kubernetes,k8s_state.pod.status_phase kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy diff --git a/kubernetes_state/test/test_kubernetes_state.py b/kubernetes_state/test/test_kubernetes_state.py index 05c945fcf8e68..60e20d965d3e9 100644 --- a/kubernetes_state/test/test_kubernetes_state.py +++ b/kubernetes_state/test/test_kubernetes_state.py @@ -158,15 +158,15 @@ def test__update_kube_state_metrics(self, mock_poll): self.assertServiceCheck(NAMESPACE + '.node.network_unavailable', self.check.OK) self.assertServiceCheck(NAMESPACE + '.node.disk_pressure', self.check.OK) self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK, - tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1']) # Running + tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1', 'phase:Running']) # Running self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.WARNING, - tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1']) # Pending + tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1', 'phase:Pending']) # Pending self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK, - tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1']) # Succeeded + tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1', 'phase:Succeeded']) # Succeeded self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.CRITICAL, - tags=['namespace:default', 'pod:should-run-once', 'optional:tag1']) # Failed + tags=['namespace:default', 'pod:should-run-once', 'optional:tag1', 'phase:Failed']) # Failed self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN, - tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1']) # Unknown + tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1', 'phase:Unknown']) # Unknown # Make sure we send counts for all statuses to avoid no-data graphing issues self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true', 'optional:tag1'], value=1) From 6d9284837f9f0e68d4a5f1f030d3ec0a21a15019 Mon Sep 17 00:00:00 2001 From: Matt Whittington Date: Mon, 4 Jun 2018 15:42:56 +1000 Subject: [PATCH 3/5] Make pod.status_phase metric report value from prometheus --- .../datadog_checks/kubernetes_state/kubernetes_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index d44e65e6c98d6..f702e5afb6369 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -353,7 +353,7 @@ def kube_pod_status_phase(self, message, **kwargs): tags = [self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label), self._label_to_tag("phase", metric.label)] + self.custom_tags self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=tags) - self.gauge(metric_name, 1, tags) + self.gauge(metric_name, metric.gauge.value, tags) def kube_pod_container_status_waiting_reason(self, message, **kwargs): metric_name = self.NAMESPACE + '.container.status_report.count.waiting' From ce2c3c07ffb38cc21b7376e0e0ab38c64a149b7e Mon Sep 17 00:00:00 2001 From: Matt Whittington Date: Tue, 3 Jul 2018 15:40:37 +1000 Subject: [PATCH 4/5] Rework pod.status_phase check Now submits an agent local count tagged with the five pod phases and with the different namespaces with pods. This should allow tracking aggregates of these metrics cluster-wide while retaining performance. Updated tests and description to match. --- .../kubernetes_state/kubernetes_state.py | 18 +- kubernetes_state/metadata.csv | 2 +- .../test/test_kubernetes_state.py | 252 ------------------ .../tests/test_kubernetes_state.py | 19 ++ 4 files changed, 33 insertions(+), 258 deletions(-) delete mode 100644 kubernetes_state/test/test_kubernetes_state.py diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index f702e5afb6369..0733d4c83bc2e 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -342,7 +342,8 @@ def _trim_job_tag(self, name): # Labels attached: namespace, pod # As a message the phase=Pending|Running|Succeeded|Failed|Unknown # From the phase the check will update its status - # Also submits as a gauge metric with tags so it is visualisable over time + # Also submits as an aggregated count with minimal tags so it is + # visualisable over time per namespace and phase def kube_pod_status_phase(self, message, **kwargs): """ Phase a pod is in. """ metric_name = self.NAMESPACE + '.pod.status_phase' @@ -350,10 +351,17 @@ def kube_pod_status_phase(self, message, **kwargs): # More details about the phase in the message of the check. check_basename = self.NAMESPACE + '.pod.phase' for metric in message.metric: - tags = [self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label), - self._label_to_tag("phase", metric.label)] + self.custom_tags - self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=tags) - self.gauge(metric_name, metric.gauge.value, tags) + self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, + tags=[self._label_to_tag("pod", metric.label), + self._label_to_tag("namespace", metric.label)] + self.custom_tags) + + # Counts aggregated cluster-wide to avoid no-data issues on pod churn, + # pod granularity available in the service checks + tags = [ + self._label_to_tag("namespace", metric.label), + self._label_to_tag("phase", metric.label) + ] + self.custom_tags + self.count(metric_name, metric.gauge.value, tags) def kube_pod_container_status_waiting_reason(self, message, **kwargs): metric_name = self.NAMESPACE + '.container.status_report.count.waiting' diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index b7112b1da2cde..d2cc9e34c89a0 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -52,7 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled -kubernetes_state.pod.status_phase,gauge,,,,Submitted with a value of 1 for each pod and tagged with 'phase'; Count this metric by a distinct phase to get the number of pods in that phase,0,kubernetes,k8s_state.pod.status_phase +kubernetes_state.pod.status_phase,gauge,,,,"To sum by `phase` to get number of pods in a given phase, and `namespace` to break this down by namespace",0,kubernetes,k8s_state.pod.status_phase kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy diff --git a/kubernetes_state/test/test_kubernetes_state.py b/kubernetes_state/test/test_kubernetes_state.py deleted file mode 100644 index 60e20d965d3e9..0000000000000 --- a/kubernetes_state/test/test_kubernetes_state.py +++ /dev/null @@ -1,252 +0,0 @@ -# (C) Datadog, Inc. 2016 -# All rights reserved -# Licensed under Simplified BSD License (see LICENSE) - -# stdlib -import mock -import os - -# 3p -from nose.plugins.attrib import attr - -# project -from tests.checks.common import AgentCheckTest - -NAMESPACE = 'kubernetes_state' - -class MockResponse: - """ - MockResponse is used to simulate the object requests.Response commonly returned by requests.get - """ - - def __init__(self, content, content_type): - self.content = content - self.headers = {'Content-Type': content_type} - - def iter_lines(self, **_): - for elt in self.content.split("\n"): - yield elt - - def close(self): - pass - - -@attr(requires='kubernetes_state') -class TestKubernetesState(AgentCheckTest): - CHECK_NAME = 'kubernetes_state' - - METRICS = [ - # nodes - NAMESPACE + '.node.cpu_capacity', - NAMESPACE + '.node.memory_capacity', - NAMESPACE + '.node.pods_capacity', - NAMESPACE + '.node.cpu_allocatable', - NAMESPACE + '.node.memory_allocatable', - NAMESPACE + '.node.pods_allocatable', - NAMESPACE + '.node.gpu.cards_capacity', - NAMESPACE + '.node.gpu.cards_allocatable', - NAMESPACE + '.nodes.by_condition', - # deployments - NAMESPACE + '.deployment.replicas', - NAMESPACE + '.deployment.replicas_available', - NAMESPACE + '.deployment.replicas_unavailable', - NAMESPACE + '.deployment.replicas_updated', - NAMESPACE + '.deployment.replicas_desired', - NAMESPACE + '.deployment.paused', - NAMESPACE + '.deployment.rollingupdate.max_unavailable', - # daemonsets - NAMESPACE + '.daemonset.scheduled', - NAMESPACE + '.daemonset.misscheduled', - NAMESPACE + '.daemonset.desired', - # hpa - NAMESPACE + '.hpa.min_replicas', - NAMESPACE + '.hpa.max_replicas', - NAMESPACE + '.hpa.desired_replicas', - NAMESPACE + '.hpa.current_replicas', - # pods - NAMESPACE + '.pod.ready', - NAMESPACE + '.pod.scheduled', - NAMESPACE + '.pod.status_phase', - # containers - NAMESPACE + '.container.ready', - NAMESPACE + '.container.running', - NAMESPACE + '.container.terminated', - NAMESPACE + '.container.status_report.count.terminated', - NAMESPACE + '.container.waiting', - NAMESPACE + '.container.status_report.count.waiting', - NAMESPACE + '.container.restarts', - NAMESPACE + '.container.cpu_requested', - NAMESPACE + '.container.memory_requested', - NAMESPACE + '.container.cpu_limit', - NAMESPACE + '.container.memory_limit', - NAMESPACE + '.container.gpu.request', - NAMESPACE + '.container.gpu.limit', - # replicasets - NAMESPACE + '.replicaset.replicas', - NAMESPACE + '.replicaset.fully_labeled_replicas', - NAMESPACE + '.replicaset.replicas_ready', - NAMESPACE + '.replicaset.replicas_desired', - # persistentvolume claim - NAMESPACE + '.persistentvolumeclaim.status', - # statefulset - NAMESPACE + '.statefulset.replicas', - NAMESPACE + '.statefulset.replicas_current', - NAMESPACE + '.statefulset.replicas_ready', - NAMESPACE + '.statefulset.replicas_updated', - ] - - TAGS = { - NAMESPACE + '.pod.ready': ['node:minikube'], - NAMESPACE + '.pod.scheduled': ['node:minikube'], - NAMESPACE + '.nodes.by_condition': [ - 'condition:MemoryPressure', 'condition:DiskPressure', - 'condition:OutOfDisk', 'condition:Ready', - 'status:true', 'status:false', 'status:unknown', - ] - } - - JOINED_METRICS = { - NAMESPACE + '.deployment.replicas': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - NAMESPACE + '.deployment.replicas_available': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - NAMESPACE + '.deployment.replicas_unavailable': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - NAMESPACE + '.deployment.replicas_updated': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - NAMESPACE + '.deployment.replicas_desired': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - NAMESPACE + '.deployment.paused': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - NAMESPACE + '.deployment.rollingupdate.max_unavailable': ['label_addonmanager_kubernetes_io_mode:Reconcile','deployment:kube-dns'], - } - - HOSTNAMES = { - NAMESPACE + '.pod.ready': 'minikube', - NAMESPACE + '.pod.scheduled': 'minikube' - } - - ZERO_METRICS = [ - NAMESPACE + '.deployment.replicas_unavailable', - NAMESPACE + '.deployment.paused', - NAMESPACE + '.daemonset.misscheduled', - NAMESPACE + '.container.terminated', - NAMESPACE + '.container.waiting', - ] - - def assertMetricNotAllZeros(self, metric_name): - for mname, ts, val, mdata in self.metrics: - if mname == metric_name: - if val != 0: - return True - raise AssertionError("All metrics named %s have 0 value." % metric_name) - - @mock.patch('checks.prometheus_check.PrometheusCheck.poll') - def test__update_kube_state_metrics(self, mock_poll): - f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt') - with open(f_name, 'rb') as f: - mock_poll.return_value = MockResponse(f.read(), 'text/plain') - - config = { - 'instances': [{ - 'host': 'foo', - 'kube_state_url': 'http://foo', - 'tags': ['optional:tag1'] - }] - } - - # run check twice to have pod/node mapping - self.run_check_twice(config) - - self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK) - self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK) - self.assertServiceCheck(NAMESPACE + '.node.memory_pressure', self.check.OK) - self.assertServiceCheck(NAMESPACE + '.node.network_unavailable', self.check.OK) - self.assertServiceCheck(NAMESPACE + '.node.disk_pressure', self.check.OK) - self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK, - tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1', 'phase:Running']) # Running - self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.WARNING, - tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1', 'phase:Pending']) # Pending - self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK, - tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1', 'phase:Succeeded']) # Succeeded - self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.CRITICAL, - tags=['namespace:default', 'pod:should-run-once', 'optional:tag1', 'phase:Failed']) # Failed - self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN, - tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1', 'phase:Unknown']) # Unknown - - # Make sure we send counts for all statuses to avoid no-data graphing issues - self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true', 'optional:tag1'], value=1) - self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:false', 'optional:tag1'], value=0) - self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0) - - for metric in self.METRICS: - self.assertMetric( - metric, - hostname=self.HOSTNAMES.get(metric, None) - ) - tags = self.TAGS.get(metric, None) - if tags: - for tag in tags: - self.assertMetricTag(metric, tag) - if metric not in self.ZERO_METRICS: - self.assertMetricNotAllZeros(metric) - - self.assert_resourcequota() - - @mock.patch('checks.prometheus_check.PrometheusCheck.poll') - def test__update_kube_state_metrics_v040(self, mock_poll): - f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt') - with open(f_name, 'rb') as f: - mock_poll.return_value = MockResponse(f.read(), 'text/plain') - - config = { - 'instances': [{ - 'host': 'foo', - 'kube_state_url': 'http://foo', - }] - } - - # run check twice to have pod/node mapping - self.run_check_twice(config) - - self.assertServiceCheck(NAMESPACE + '.node.ready', self.check.OK) - self.assertServiceCheck(NAMESPACE + '.node.out_of_disk', self.check.OK) - - for metric in self.METRICS: - if not metric.startswith(NAMESPACE + '.hpa'): - self.assertMetric(metric) - - self.assert_resourcequota() - - @mock.patch('checks.prometheus_check.PrometheusCheck.poll') - def test__join_custom_labels(self, mock_poll): - f_name = os.path.join(os.path.dirname(__file__), 'ci', 'fixtures', 'prometheus', 'prometheus.txt') - with open(f_name, 'rb') as f: - mock_poll.return_value = MockResponse(f.read(), 'text/plain') - - config = { - 'instances': [{ - 'host': 'foo', - 'kube_state_url': 'http://foo', - 'label_joins': { - 'kube_deployment_labels': { - 'label_to_match': 'deployment', - 'labels_to_get':['label_addonmanager_kubernetes_io_mode'] - } - }, - }] - } - # run check twice to have the labels join mapping. - self.run_check_twice(config) - for metric in self.METRICS: - self.assertMetric( - metric, - hostname=self.HOSTNAMES.get(metric, None) - ) - tags = self.JOINED_METRICS.get(metric, None) - if tags: - for tag in tags: - self.assertMetricTag(metric, tag) - if metric not in self.ZERO_METRICS: - self.assertMetricNotAllZeros(metric) - - def assert_resourcequota(self): - """ The metric name is created dynamically so we just check some exist. """ - for m in self.metrics: - if 'kubernetes_state.resourcequota.' in m[0]: - return True - return False diff --git a/kubernetes_state/tests/test_kubernetes_state.py b/kubernetes_state/tests/test_kubernetes_state.py index caeb0b3440c12..5e131e675d55e 100644 --- a/kubernetes_state/tests/test_kubernetes_state.py +++ b/kubernetes_state/tests/test_kubernetes_state.py @@ -46,6 +46,7 @@ # pods NAMESPACE + '.pod.ready', NAMESPACE + '.pod.scheduled', + NAMESPACE + '.pod.status_phase', # containers NAMESPACE + '.container.ready', NAMESPACE + '.container.running', @@ -82,6 +83,12 @@ 'condition:OutOfDisk', 'condition:Ready', 'status:true', 'status:false', 'status:unknown', ], + NAMESPACE + '.pod.status_phase': [ + 'phase:Pending', 'phase:Running', + 'phase:Failed', 'phase:Succeeded', + 'phase:Unknown', 'namespace:default', + 'namespace:kube-system' + ], NAMESPACE + '.container.status_report.count.waiting': [ 'reason:CrashLoopBackoff', 'reason:ErrImagePull', @@ -217,6 +224,18 @@ def test_update_kube_state_metrics(aggregator, instance, check): aggregator.assert_metric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0) + # Make sure we send counts for all phases to avoid no-data graphing issues + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=0) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=0) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=0) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=0) + aggregator.assert_metric(NAMESPACE + '.pod.status_phase', + tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=0) + for metric in METRICS: aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None)) for tag in TAGS.get(metric, []): From 22933ee752332a7ba6fd9a5189e173a85c9c696c Mon Sep 17 00:00:00 2001 From: Matt Whittington Date: Thu, 5 Jul 2018 01:46:28 +1000 Subject: [PATCH 5/5] Update pod.status_phase metric submission to aggregate client side counters --- .../kubernetes_state/kubernetes_state.py | 9 +++++++-- kubernetes_state/tests/test_kubernetes_state.py | 10 +++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index 0733d4c83bc2e..2ffb387082dc7 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -4,7 +4,7 @@ import re import time -from collections import defaultdict +from collections import defaultdict, Counter try: # Agent5 compatibility layer @@ -350,6 +350,8 @@ def kube_pod_status_phase(self, message, **kwargs): # Will submit a service check which status is given by its phase. # More details about the phase in the message of the check. check_basename = self.NAMESPACE + '.pod.phase' + status_phase_counter = Counter() + for metric in message.metric: self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=[self._label_to_tag("pod", metric.label), @@ -361,7 +363,10 @@ def kube_pod_status_phase(self, message, **kwargs): self._label_to_tag("namespace", metric.label), self._label_to_tag("phase", metric.label) ] + self.custom_tags - self.count(metric_name, metric.gauge.value, tags) + status_phase_counter[tuple(sorted(tags))] += metric.gauge.value + + for tags, count in status_phase_counter.iteritems(): + self.gauge(metric_name, count, tags=list(tags)) def kube_pod_container_status_waiting_reason(self, message, **kwargs): metric_name = self.NAMESPACE + '.container.status_report.count.waiting' diff --git a/kubernetes_state/tests/test_kubernetes_state.py b/kubernetes_state/tests/test_kubernetes_state.py index 5e131e675d55e..d47873e9cf563 100644 --- a/kubernetes_state/tests/test_kubernetes_state.py +++ b/kubernetes_state/tests/test_kubernetes_state.py @@ -226,15 +226,15 @@ def test_update_kube_state_metrics(aggregator, instance, check): # Make sure we send counts for all phases to avoid no-data graphing issues aggregator.assert_metric(NAMESPACE + '.pod.status_phase', - tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=0) + tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=1) aggregator.assert_metric(NAMESPACE + '.pod.status_phase', - tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=0) + tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=3) aggregator.assert_metric(NAMESPACE + '.pod.status_phase', - tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=0) + tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=2) aggregator.assert_metric(NAMESPACE + '.pod.status_phase', - tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=0) + tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=2) aggregator.assert_metric(NAMESPACE + '.pod.status_phase', - tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=0) + tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=1) for metric in METRICS: aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None))