diff --git a/kubernetes_state/check.py b/kubernetes_state/check.py index 0f6ff04cf59d9..88c5f458853fa 100644 --- a/kubernetes_state/check.py +++ b/kubernetes_state/check.py @@ -72,8 +72,8 @@ def __init__(self, name, init_config, agentConfig, instances=None): 'kube_pod_container_status_running': 'container.running', 'kube_pod_container_status_terminated': 'container.terminated', 'kube_pod_container_status_waiting': 'container.waiting', - 'kube_pod_container_resource_requests_nvidia_gpu_devices': 'container.gpu.resource_request', - 'kube_pod_container_resource_limits_nvidia_gpu_devices': 'container.gpu.resource_limit', + 'kube_pod_container_resource_requests_nvidia_gpu_devices': 'container.gpu.request', + 'kube_pod_container_resource_limits_nvidia_gpu_devices': 'container.gpu.limit', 'kube_pod_status_ready': 'pod.ready', 'kube_pod_status_scheduled': 'pod.scheduled', 'kube_replicaset_spec_replicas': 'replicaset.replicas_desired', @@ -133,6 +133,7 @@ def __init__(self, name, init_config, agentConfig, instances=None): 'kube_node_status_phase', # These CronJob and Job metrics need use cases to determine how do implement 'kube_cronjob_status_active', + 'kube_cronjob_status_last_schedule_time', 'kube_cronjob_spec_suspend', 'kube_cronjob_spec_starting_deadline_seconds', 'kube_job_spec_active_dealine_seconds', @@ -254,20 +255,6 @@ def kube_pod_status_phase(self, message, **kwargs): status = self.pod_phase_to_status.get(phase, self.UNKNOWN) self.service_check(check_basename + phase, status, tags=tags) - def kube_cronjob_status_last_schedule_time(self, message, **kwargs): - """ Time since the last succesful schedule """ - # Used as a metric so that one can compare the time since the last successful schedule and when the cronjob is supposed to be run - metric_name = self.NAMESPACE + '.cronjob.delay' - curr_time = time.time() - for metric in message.metric: - delay = curr_time - metric.gauge.value - if delay > 0: - tags = [self._format_tag(label.name, label.value) for label in 
metric.label] - self.gauge(metric_name, delay, tags) - else: - tags = [self._format_tag(label.name, label.value) for label in metric.label] - self.gauge(metric_name, 0, tags) - def kube_cronjob_next_schedule_time(self, message, **kwargs): """ Time until the next schedule """ # Used as a service check so that one can be alerted if the cronjob's next schedule is in the past @@ -277,7 +264,8 @@ def kube_cronjob_next_schedule_time(self, message, **kwargs): on_schedule = int(metric.gauge.value) - curr_time tags = [self._format_tag(label.name, label.value) for label in metric.label] if on_schedule < 0: - self.service_check(check_basename, self.CRITICAL, tags=tags) + message = "The service check scheduled at %s is %s seconds late" % (time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(int(metric.gauge.value))), on_schedule) + self.service_check(check_basename, self.CRITICAL, tags=tags, message=message) else: self.service_check(check_basename, self.OK, tags=tags) diff --git a/kubernetes_state/ci/fixtures/prometheus/protobuf.bin b/kubernetes_state/ci/fixtures/prometheus/protobuf.bin deleted file mode 100644 index 25ddee8829e67..0000000000000 Binary files a/kubernetes_state/ci/fixtures/prometheus/protobuf.bin and /dev/null differ diff --git a/kubernetes_state/metadata.csv b/kubernetes_state/metadata.csv index 1016b76af133c..26ffb450ec8d6 100644 --- a/kubernetes_state/metadata.csv +++ b/kubernetes_state/metadata.csv @@ -3,6 +3,8 @@ kubernetes_state.container.ready,gauge,,,,Whether the containers readiness check kubernetes_state.container.running,gauge,,,,Whether the container is currently in running state,0,kubernetes,k8s_state.container.running kubernetes_state.container.terminated,gauge,,,,Whether the container is currently in terminated state,0,kubernetes,k8s_state.container.term kubernetes_state.container.waiting,gauge,,,,Whether the container is currently in waiting state,0,kubernetes,k8s_state.container.wait +kubernetes_state.container.gpu.request,gauge,,,,The number of 
requested gpu devices by a container,0,kubernetes,k8s_state.container.gpu.request +kubernetes_state.container.gpu.limit,gauge,,,,The limit on gpu devices to be used by a container,0,kubernetes,k8s_state.container.gpu.limit kubernetes_state.container.restarts,gauge,,,,The number of restarts per container,-1,kubernetes,k8s_state.container.restarts kubernetes_state.container.cpu_requested,gauge,,cpu,,The number of requested cpu cores by a container,0,kubernetes,k8s_state.container.cpu_req kubernetes_state.container.memory_requested,gauge,,byte,,The number of requested memory bytes by a container,0,kubernetes,k8s_state.container.mem_req @@ -34,6 +36,9 @@ kubernetes_state.limitrange.memory.max_limit_request_ratio,gauge,,,,Maximum memo kubernetes_state.node.cpu_capacity,gauge,,cpu,,The total CPU resources of the node,0,kubernetes,k8s_state.node.cpu_capacity kubernetes_state.node.memory_capacity,gauge,,byte,,The total memory resources of the node,0,kubernetes,k8s_state.node.memory_capacity kubernetes_state.node.pods_capacity,gauge,,,,The total pod resources of the node,0,kubernetes,k8s_state.node.pods_capacity +kubernetes_state.node.gpu.cards_allocatable,gauge,,,,The Nvidia GPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.gpu.cards_allocatable +kubernetes_state.node.gpu.cards_capacity,gauge,,,,The total Nvidia GPU resources of the node,0,kubernetes,k8s_state.node.gpu.cards_capacity +kubernetes_state.persistentvolumeclaim.status,gauge,,,,The phase the persistent volume claim is currently in,-1,kubernetes,k8s_state.persistentvolumeclaim.status kubernetes_state.node.cpu_allocatable,gauge,,cpu,,The CPU resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.cpu_allocatable kubernetes_state.node.memory_allocatable,gauge,,byte,,The memory resources of a node that are available for scheduling,0,kubernetes,k8s_state.node.memory_allocatable kubernetes_state.node.pods_allocatable,gauge,,,,The pod resources of a node that 
are available for scheduling,0,kubernetes,k8s_state.node.pods_allocatable diff --git a/kubernetes_state/test_kubernetes_state.py b/kubernetes_state/test_kubernetes_state.py index 796f101d38672..076569ac255b8 100644 --- a/kubernetes_state/test_kubernetes_state.py +++ b/kubernetes_state/test_kubernetes_state.py @@ -52,15 +52,13 @@ class TestKubernetesState(AgentCheckTest): NAMESPACE + '.container.memory_requested', NAMESPACE + '.container.cpu_limit', NAMESPACE + '.container.memory_limit', - NAMESPACE + '.container.gpu.resource_request', - NAMESPACE + '.container.gpu.resource_limit', + NAMESPACE + '.container.gpu.request', + NAMESPACE + '.container.gpu.limit', # replicasets NAMESPACE + '.replicaset.replicas', NAMESPACE + '.replicaset.fully_labeled_replicas', NAMESPACE + '.replicaset.replicas_ready', NAMESPACE + '.replicaset.replicas_desired', - # cronjob - NAMESPACE + '.cronjob.delay', # persistentvolume claim NAMESPACE + '.persistentvolumeclaim.status', ]