#!/usr/bin/python
# check_es_jvm_usage.py
# Forked from stack72/nagios-elasticsearch.
from nagioscheck import NagiosCheck, UsageError
from nagioscheck import PerformanceMetric, Status
import urllib2
import optparse
try:
import json
except ImportError:
import simplejson as json
class ESJVMHealthCheck(NagiosCheck):
    """Nagios check for per-node JVM heap usage of an Elasticsearch cluster.

    The check requests ``/_nodes/stats/jvm`` from the given host and port,
    reads ``heap_used_percent`` for every node in the reply and compares it
    against the warning and critical thresholds.
    """

    def __init__(self):
        """Register the command-line options understood by this check."""
        NagiosCheck.__init__(self)

        self.add_option('H', 'host', 'host', 'The cluster to check')
        self.add_option('P', 'port', 'port', 'The ES port - defaults to 9200')
        self.add_option('C', 'critical_threshold', 'critical_threshold',
                        'The level at which we throw a CRITICAL alert'
                        ' - defaults to 97% of the JVM setting')
        self.add_option('W', 'warning_threshold', 'warning_threshold',
                        'The level at which we throw a WARNING alert'
                        ' - defaults to 90% of the JVM setting')

    def _fetch_jvm_stats(self, host, port):
        """Fetch and decode the JVM node statistics document.

        Returns the decoded JSON document.

        Raises:
            Status('unknown'): the server answered with an HTTP error, or
                the body was not valid JSON.
            Status('critical'): the server could not be reached.
        """
        try:
            response = urllib2.urlopen(r'http://%s:%d/_nodes/stats/jvm'
                                       % (host, port))
        except urllib2.HTTPError as e:
            # HTTPError must be handled before URLError: it is caught first
            # so that an HTTP-level failure is reported as "unknown".
            raise Status('unknown', ("API failure", None,
                                     "API failure:\n\n%s" % str(e)))
        except urllib2.URLError as e:
            # e.reason is not guaranteed to be a string (it may be a socket
            # error object), so render it explicitly.
            raise Status('critical', str(e.reason))

        # Always release the connection, even if reading the body fails.
        try:
            response_body = response.read()
        finally:
            response.close()

        try:
            return json.loads(response_body)
        except ValueError:
            raise Status('unknown', ("API returned nonsense",))

    def check(self, opts, args):
        """Evaluate JVM heap usage of every node and report the result.

        Args:
            opts: parsed options; ``host``, ``port``, ``critical_threshold``
                and ``warning_threshold`` are read. Port defaults to 9200,
                critical to 97 and warning to 90 when not supplied.
            args: positional arguments (unused).

        Raises:
            Status: always. Every code path ends by raising a ``Status``
                carrying the outcome ("Critical", "Warning", "OK",
                'critical' or 'unknown') and a message.
        """
        host = opts.host
        port = int(opts.port or '9200')
        critical = int(opts.critical_threshold or '97')
        warning = int(opts.warning_threshold or '90')

        nodes_jvm_data = self._fetch_jvm_stats(host, port)

        # A syntactically valid reply can still lack the expected structure
        # (e.g. an error document); report that instead of crashing.
        nodes = None
        if isinstance(nodes_jvm_data, dict):
            nodes = nodes_jvm_data.get('nodes')
        if not isinstance(nodes, dict):
            raise Status('unknown', ("API returned nonsense",))

        critical_details = []
        warning_details = []

        for node_id, node in nodes.items():
            try:
                jvm_percentage = node['jvm']['mem']['heap_used_percent']
                heap_used = int(jvm_percentage)
            except (KeyError, TypeError, ValueError):
                raise Status('unknown', ("API returned nonsense",))

            # Fall back to the node id when the entry carries no host name.
            node_name = node.get('host', node_id)
            detail = ("%s currently running at %s%% JVM mem "
                      % (node_name, jvm_percentage))

            if heap_used >= critical:
                critical_details.append(detail)
            elif heap_used >= warning:
                # Reaching this branch already implies heap_used < critical.
                warning_details.append(detail)

        criticals = len(critical_details)
        warnings = len(warning_details)

        if criticals > 0:
            raise Status("Critical",
                         "There are '%s' node(s) in the cluster that have "
                         "breached the %% JVM heap usage critical threshold "
                         "of %s%%. They are:\r\n%s"
                         % (
                             criticals,
                             critical,
                             str("\r\n".join(critical_details))
                         ))
        elif warnings > 0:
            raise Status("Warning",
                         "There are '%s' node(s) in the cluster that have "
                         "breached the %% JVM mem usage warning threshold of "
                         "%s%%. They are:\r\n%s"
                         % (warnings, warning,
                            str("\r\n".join(warning_details))))
        else:
            raise Status("OK", "All nodes in the cluster are currently below "
                         "the % JVM mem warning threshold")
if __name__ == "__main__":
    # Script entry point: build the check object and invoke its run().
    health_check = ESJVMHealthCheck()
    health_check.run()