Skip to content

Commit

Permalink
Rework pod.status_phase check
Browse files Browse the repository at this point in the history
Now submits an agent local count tagged with the five pod phases and
with the different namespaces with pods. This should allow tracking
aggregates of these metrics cluster-wide while retaining performance.

Updated tests and description to match.
  • Loading branch information
mwhittington21 committed Jul 3, 2018
1 parent 949e0d5 commit f01e1b2
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 258 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -340,18 +340,26 @@ def _trim_job_tag(self, name):
# Labels attached: namespace, pod
# As a message the phase=Pending|Running|Succeeded|Failed|Unknown
# From the phase the check will update its status
# Also submits as an aggregated count with minimal tags so it is
# visualisable over time per namespace and phase
def kube_pod_status_phase(self, message, **kwargs):
    """Report pod phases as service checks and an aggregated count.

    For each `kube_pod_status_phase` sample in *message*:
      - submits a service check (`<NAMESPACE>.pod.phase`) tagged with pod
        and namespace, whose status is derived from the pod's phase via
        `self.pod_phase_to_status`; phase details go in the check message;
      - submits an aggregated count (`<NAMESPACE>.pod.status_phase`) tagged
        only with namespace and phase, so cluster-wide totals can be summed
        per namespace/phase without no-data gaps caused by pod churn.
    """
    metric_name = self.NAMESPACE + '.pod.status_phase'
    # Will submit a service check which status is given by its phase.
    # More details about the phase in the message of the check.
    check_basename = self.NAMESPACE + '.pod.phase'
    for metric in message.metric:
        # Per-pod granularity lives in the service check only.
        self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status,
                                     tags=[self._label_to_tag("pod", metric.label),
                                           self._label_to_tag("namespace", metric.label)] + self.custom_tags)

        # Counts aggregated cluster-wide to avoid no-data issues on pod churn;
        # pod granularity is available in the service checks.
        tags = [
            self._label_to_tag("namespace", metric.label),
            self._label_to_tag("phase", metric.label)
        ] + self.custom_tags
        self.count(metric_name, metric.gauge.value, tags)

def kube_pod_container_status_waiting_reason(self, message, **kwargs):
metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
Expand Down
2 changes: 1 addition & 1 deletion kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b
kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.pod.status_phase,gauge,,,,"To sum by `phase` to get number of pods in a given phase, and `namespace` to break this down by namespace",0,kubernetes,k8s_state.pod.status_phase
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
Expand Down
252 changes: 0 additions & 252 deletions kubernetes_state/test/test_kubernetes_state.py

This file was deleted.

19 changes: 19 additions & 0 deletions kubernetes_state/tests/test_kubernetes_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
# pods
NAMESPACE + '.pod.ready',
NAMESPACE + '.pod.scheduled',
NAMESPACE + '.pod.status_phase',
# containers
NAMESPACE + '.container.ready',
NAMESPACE + '.container.running',
Expand Down Expand Up @@ -81,6 +82,12 @@
'condition:MemoryPressure', 'condition:DiskPressure',
'condition:OutOfDisk', 'condition:Ready',
'status:true', 'status:false', 'status:unknown',
],
NAMESPACE + '.pod.status_phase': [
'phase:Pending', 'phase:Running',
'phase:Failed', 'phase:Succeeded',
'phase:Unknown', 'namespace:default',
'namespace:kube-system'
]
}

Expand Down Expand Up @@ -212,6 +219,18 @@ def test_update_kube_state_metrics(aggregator, instance, check):
aggregator.assert_metric(NAMESPACE + '.nodes.by_condition',
tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0)

# Make sure we send counts for all phases to avoid no-data graphing issues
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=0)

for metric in METRICS:
aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None))
for tag in TAGS.get(metric, []):
Expand Down

0 comments on commit f01e1b2

Please sign in to comment.