Skip to content

Commit

Permalink
updating per @xvello's review
Browse files Browse the repository at this point in the history
  • Loading branch information
CharlyF committed Nov 8, 2017
1 parent ed5b8c6 commit da26a02
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 21 deletions.
22 changes: 5 additions & 17 deletions kubernetes_state/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,8 @@ def __init__(self, name, init_config, agentConfig, instances=None):
'kube_pod_container_status_running': 'container.running',
'kube_pod_container_status_terminated': 'container.terminated',
'kube_pod_container_status_waiting': 'container.waiting',
'kube_pod_container_resource_requests_nvidia_gpu_devices': 'container.gpu.resource_request',
'kube_pod_container_resource_limits_nvidia_gpu_devices': 'container.gpu.resource_limit',
'kube_pod_container_resource_requests_nvidia_gpu_devices': 'container.gpu.request',
'kube_pod_container_resource_limits_nvidia_gpu_devices': 'container.gpu.limit',
'kube_pod_status_ready': 'pod.ready',
'kube_pod_status_scheduled': 'pod.scheduled',
'kube_replicaset_spec_replicas': 'replicaset.replicas_desired',
Expand Down Expand Up @@ -133,6 +133,7 @@ def __init__(self, name, init_config, agentConfig, instances=None):
'kube_node_status_phase',
# These CronJob and Job metrics need use cases to determine how to implement them
'kube_cronjob_status_active',
'kube_cronjob_status_last_schedule_time',
'kube_cronjob_spec_suspend',
'kube_cronjob_spec_starting_deadline_seconds',
'kube_job_spec_active_dealine_seconds',
Expand Down Expand Up @@ -254,20 +255,6 @@ def kube_pod_status_phase(self, message, **kwargs):
status = self.pod_phase_to_status.get(phase, self.UNKNOWN)
self.service_check(check_basename + phase, status, tags=tags)

def kube_cronjob_status_last_schedule_time(self, message, **kwargs):
    """Report the time elapsed since the last successful schedule.

    For every entry in ``message.metric`` the entry's gauge value is
    subtracted from the current time and the result is submitted as the
    ``<NAMESPACE>.cronjob.delay`` gauge, tagged with the entry's labels
    (each formatted through ``self._format_tag``). A delay that is not
    strictly positive is reported as 0.

    :param message: object exposing a ``metric`` iterable; each item must
        provide ``gauge.value`` and a ``label`` iterable of name/value pairs.
    :param kwargs: accepted for signature compatibility; unused here.
    """
    # Used as a metric so that one can compare the time since the last
    # successful schedule and when the cronjob is supposed to be run.
    metric_name = self.NAMESPACE + '.cronjob.delay'
    curr_time = time.time()
    for metric in message.metric:
        # NOTE(review): gauge.value is presumably an epoch timestamp in
        # seconds, since it is compared against time.time() -- confirm.
        delay = curr_time - metric.gauge.value
        # Tags are the same whichever value is submitted, so build them once.
        tags = [self._format_tag(label.name, label.value) for label in metric.label]
        # Clamp to 0 so a timestamp in the future never yields a negative delay.
        self.gauge(metric_name, delay if delay > 0 else 0, tags)

def kube_cronjob_next_schedule_time(self, message, **kwargs):
""" Time until the next schedule """
# Used as a service check so that one can be alerted if the cronjob's next schedule is in the past
Expand All @@ -277,7 +264,8 @@ def kube_cronjob_next_schedule_time(self, message, **kwargs):
on_schedule = int(metric.gauge.value) - curr_time
tags = [self._format_tag(label.name, label.value) for label in metric.label]
if on_schedule < 0:
self.service_check(check_basename, self.CRITICAL, tags=tags)
message = "The service check scheduled at %s is %s seconds late" % (time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(int(metric.gauge.value))), on_schedule)
self.service_check(check_basename, self.CRITICAL, tags=tags, message=message)
else:
self.service_check(check_basename, self.OK, tags=tags)

Expand Down
Binary file removed kubernetes_state/ci/fixtures/prometheus/protobuf.bin
Binary file not shown.
5 changes: 5 additions & 0 deletions kubernetes_state/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ kubernetes_state.container.ready,gauge,,,,Whether the containers readiness check
kubernetes_state.container.running,gauge,,,,Whether the container is currently in running state,0,kubernetes,k8s_state.container.running
kubernetes_state.container.terminated,gauge,,,,Whether the container is currently in terminated state,0,kubernetes,k8s_state.container.term
kubernetes_state.container.waiting,gauge,,,,Whether the container is currently in waiting state,0,kubernetes,k8s_state.container.wait
kubernetes_state.container.gpu.request,gauge,,,,The number of requested gpu devices by a container,0,kubernetes,k8s_state.container.gpu.request
kubernetes_state.container.gpu.limit,gauge,,,,The limit on gpu devices to be used by a container,0,kubernetes,k8s_state.container.gpu.limit
kubernetes_state.container.restarts,gauge,,,,The number of restarts per container,-1,kubernetes,k8s_state.container.restarts
kubernetes_state.container.cpu_requested,gauge,,cpu,,The number of requested cpu cores by a container,0,kubernetes,k8s_state.container.cpu_req
kubernetes_state.container.memory_requested,gauge,,byte,,The number of requested memory bytes by a container,0,kubernetes,k8s_state.container.mem_req
Expand Down Expand Up @@ -34,6 +36,9 @@ kubernetes_state.limitrange.memory.max_limit_request_ratio,gauge,,,,Maximum memo
kubernetes_state.node.cpu_capacity,gauge,,cpu,,The total CPU resources of the node,0,kubernetes,k8s_state.node.cpu_capacity
kubernetes_state.node.memory_capacity,gauge,,byte,,The total memory resources of the node,0,kubernetes,k8s_state.node.memory_capacity
kubernetes_state.node.pods_capacity,gauge,,,,The total pod resources of the node,0,kubernetes,k8s_state.node.pods_capacity
kubernetes_state.node.gpu.cards_allocatable,gauge,,,,The Nvidia GPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.gpu.cards_allocatable
kubernetes_state.node.gpu.cards_capacity,gauge,,,,The total Nvidia GPU resources of the node,0,kubernetes,k8s_state.node.gpu.cards_capacity
kubernetes_state.persistentvolumeclaim.status,gauge,,,,The phase the persistent volume claim is currently in,-1,kubernetes,k8s_state.persistentvolumeclaim.status
kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable
kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable
kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable
Expand Down
6 changes: 2 additions & 4 deletions kubernetes_state/test_kubernetes_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,13 @@ class TestKubernetesState(AgentCheckTest):
NAMESPACE + '.container.memory_requested',
NAMESPACE + '.container.cpu_limit',
NAMESPACE + '.container.memory_limit',
NAMESPACE + '.container.gpu.resource_request',
NAMESPACE + '.container.gpu.resource_limit',
NAMESPACE + '.container.gpu.request',
NAMESPACE + '.container.gpu.limit',
# replicasets
NAMESPACE + '.replicaset.replicas',
NAMESPACE + '.replicaset.fully_labeled_replicas',
NAMESPACE + '.replicaset.replicas_ready',
NAMESPACE + '.replicaset.replicas_desired',
# cronjob
NAMESPACE + '.cronjob.delay',
# persistentvolume claim
NAMESPACE + '.persistentvolumeclaim.status',
]
Expand Down

0 comments on commit da26a02

Please sign in to comment.