Skip to content

Commit

Permalink
PR changes - tag cardinality, docs, code refactor
Browse files Browse the repository at this point in the history
Reduce the cardinality of the tags for the pod.status_phase metric to
increase scalability.

Rework the pod.status_phase metadata docstring to better represent what
the metric is and how to use it.

Refactor the code to make use of existing constructs.

Add a "phase" tag to the service check portion of the pod_status_phase
Prometheus metric processing. Update tests to match.
  • Loading branch information
mwhittington21 committed Jul 3, 2018
1 parent 3972f3d commit eb5ad98
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -348,13 +348,10 @@ def kube_pod_status_phase(self, message, **kwargs):
# More details about the phase in the message of the check.
check_basename = self.NAMESPACE + '.pod.phase'
for metric in message.metric:
self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status,
tags=[self._label_to_tag("pod", metric.label),
self._label_to_tag("namespace", metric.label)] + self.custom_tags)
# More verbose tagging for gauge vs service check
tags = [self._format_tag(label.name, label.value) for label in metric.label] + self.custom_tags
val = getattr(metric, METRIC_TYPES[message.type]).value
self.gauge(metric_name, val, tags)
tags = [self._label_to_tag("pod", metric.label), self._label_to_tag("namespace", metric.label),
self._label_to_tag("phase", metric.label)] + self.custom_tags
self._condition_to_tag_check(metric, check_basename, self.pod_phase_to_status, tags=tags)
self.gauge(metric_name, 1, tags)

def kube_pod_container_status_waiting_reason(self, message, **kwargs):
metric_name = self.NAMESPACE + '.container.status_report.count.waiting'
Expand Down
2 changes: 1 addition & 1 deletion kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ kubernetes_state.hpa.target_cpu,gauge,,,,Target CPU percentage of pods managed b
kubernetes_state.hpa.desired_replicas,gauge,,,,Desired number of replicas of pods managed by this autoscaler,0,kubernetes,k8s_state.hpa.desired_replicas
kubernetes_state.pod.ready,gauge,,,,"In association with the `condition` tag, whether the pod is ready to serve requests, e.g. `condition:true` keeps the pods that are in a ready state",1,kubernetes,k8s_state.pod.ready
kubernetes_state.pod.scheduled,gauge,,,,Reports the status of the scheduling process for the pod with its tags,0,kubernetes,k8s_state.pod.scheduled
kubernetes_state.pod.status_phase,gauge,,,,Count of current pods with their phase as a tag,0,kubernetes,k8s_state.pod.status_phase
kubernetes_state.pod.status_phase,gauge,,,,Submitted with a value of 1 for each pod and tagged with 'phase'; count this metric by a distinct phase to get the number of pods in that phase,0,kubernetes,k8s_state.pod.status_phase
kubernetes_state.replicaset.replicas,gauge,,,,The number of replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas
kubernetes_state.replicaset.fully_labeled_replicas,gauge,,,,The number of fully labeled replicas per ReplicaSet,0,kubernetes,k8s_state.rs.fully_labeled
kubernetes_state.replicaset.replicas_ready,gauge,,,,The number of ready replicas per ReplicaSet,0,kubernetes,k8s_state.rs.replicas_rdy
Expand Down
10 changes: 5 additions & 5 deletions kubernetes_state/test/test_kubernetes_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,15 +158,15 @@ def test__update_kube_state_metrics(self, mock_poll):
self.assertServiceCheck(NAMESPACE + '.node.network_unavailable', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.node.disk_pressure', self.check.OK)
self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK,
tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1']) # Running
tags=['namespace:default', 'pod:task-pv-pod', 'optional:tag1', 'phase:Running']) # Running
self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.WARNING,
tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1']) # Pending
tags=['namespace:default', 'pod:failingtest-f585bbd4-2fsml', 'optional:tag1', 'phase:Pending']) # Pending
self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.OK,
tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1']) # Succeeded
tags=['namespace:default', 'pod:hello-1509998340-k4f8q', 'optional:tag1', 'phase:Succeeded']) # Succeeded
self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.CRITICAL,
tags=['namespace:default', 'pod:should-run-once', 'optional:tag1']) # Failed
tags=['namespace:default', 'pod:should-run-once', 'optional:tag1', 'phase:Failed']) # Failed
self.assertServiceCheck(NAMESPACE + '.pod.phase', self.check.UNKNOWN,
tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1']) # Unknown
tags=['namespace:default', 'pod:hello-1509998460-tzh8k', 'optional:tag1', 'phase:Unknown']) # Unknown

# Make sure we send counts for all statuses to avoid no-data graphing issues
self.assertMetric(NAMESPACE + '.nodes.by_condition', tags=['condition:Ready', 'status:true', 'optional:tag1'], value=1)
Expand Down

0 comments on commit eb5ad98

Please sign in to comment.