Skip to content

Commit

Permalink
Rework pod.status_phase check
Browse files Browse the repository at this point in the history
Now submits an agent local count tagged with the five pod phases and
with the different namespaces with pods. This should allow tracking
aggregates of these metrics cluster-wide while retaining performance.

Updated tests and description to match.
  • Loading branch information
mwhittington21 committed Jul 3, 2018
1 parent 949e0d5 commit f01e1b2
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 258 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -340,18 +340,26 @@ def _trim_job_tag(self, name):
# Labels attached: namespace, pod
# As a message the phase=Pending|Running|Succeeded|Failed|Unknown
# From the phase the check will update its status
# Also submits as an aggregated count with minimal tags so it is
# visualisable over time per namespace and phase
def kube_pod_status_phase(self, message, **kwargs):
    """Report pod phases as service checks and an aggregated count.

    For each `kube_pod_status_phase` sample in *message*:
      - submits a service check (`<NAMESPACE>.pod.phase`) tagged with pod
        and namespace, whose status is derived from the pod's phase via
        `self.pod_phase_to_status`; phase details go in the check message;
      - submits an aggregated count (`<NAMESPACE>.pod.status_phase`) tagged
        only with namespace and phase, so cluster-wide totals can be summed
        per namespace/phase without no-data gaps caused by pod churn.
    """
    metric_name = self.NAMESPACE + '.pod.status_phase'
    # Will submit a service check which status is given by its phase.
    # More details about the phase in the message of the check.
    check_basename = self.NAMESPACE + '.pod.phase'
    for metric in message.metric:
        # Per-pod granularity lives in the service check only.
        self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status,
                                     tags=[self._label_to_tag("pod", metric.label),
                                           self._label_to_tag("namespace", metric.label)] + self.custom_tags)

        # Counts aggregated cluster-wide to avoid no-data issues on pod churn;
        # pod granularity is available in the service checks.
        tags = [
            self._label_to_tag("namespace", metric.label),
            self._label_to_tag("phase", metric.label)
        ] + self.custom_tags
        self.count(metric_name, metric.gauge.value, tags)

def kube_pod_container_status_waiting_reason(self, message, **kwargs):
metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
Expand Down
2 changes: 1 addition & 1 deletion kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b
kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.pod.status_phase,gauge,,,,"To sum by `phase` to get number of pods in a given phase, and `namespace` to break this down by namespace",0,kubernetes,k8s_state.pod.status_phase
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
Expand Down
252 changes: 0 additions & 252 deletions kubernetes_state/test/test_kubernetes_state.py

This file was deleted.

19 changes: 19 additions & 0 deletions kubernetes_state/tests/test_kubernetes_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
# pods
NAMESPACE + '.pod.ready',
NAMESPACE + '.pod.scheduled',
NAMESPACE + '.pod.status_phase',
# containers
NAMESPACE + '.container.ready',
NAMESPACE + '.container.running',
Expand Down Expand Up @@ -81,6 +82,12 @@
'condition:MemoryPressure', 'condition:DiskPressure',
'condition:OutOfDisk', 'condition:Ready',
'status:true', 'status:false', 'status:unknown',
],
NAMESPACE + '.pod.status_phase': [
'phase:Pending', 'phase:Running',
'phase:Failed', 'phase:Succeeded',
'phase:Unknown', 'namespace:default',
'namespace:kube-system'
]
}

Expand Down Expand Up @@ -212,6 +219,18 @@ def test_update_kube_state_metrics(aggregator, instance, check):
aggregator.assert_metric(NAMESPACE + '.nodes.by_condition',
tags=['condition:Ready', 'status:unknown', 'optional:tag1'], value=0)

# Make sure we send counts for all phases to avoid no-data graphing issues
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Pending', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Running', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Succeeded', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Failed', 'optional:tag1'], value=0)
aggregator.assert_metric(NAMESPACE + '.pod.status_phase',
tags=['namespace:default', 'phase:Unknown', 'optional:tag1'], value=0)

for metric in METRICS:
aggregator.assert_metric(metric, hostname=HOSTNAMES.get(metric, None))
for tag in TAGS.get(metric, []):
Expand Down

0 comments on commit f01e1b2

Please sign in to comment.