-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathcheck_puppet.rb
executable file
·208 lines (175 loc) · 7.75 KB
/
check_puppet.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/ruby
# A simple Nagios check that should be run as root,
# perhaps under the mcollective NRPE plugin. It checks
# how long ago Puppet last ran. It can also check
# failure counts and skip machines on which the agent
# is not enabled.
#
# The script uses Puppet's last_run_summary.yaml file
# to determine when Puppet last ran, falling back to
# the age of the state file.
#
# 19/12/2013 --- Change to lock files and handling of a puppet agent in a disabled state (WARNING)
# *** These changes are made to work with puppet 3.X and MAY cause some issues with 2.x users
# *** The script should still continue to work for 2.x, but may not handle the lockfiles correctly
# *** and will require the proper arguments to match 2.x filename.
require 'optparse'
require 'yaml'
# Candidate state directories; the second one is used whenever it exists
# on this machine, otherwise the first.
statedir_puppet_3 = "/var/lib/puppet/state"
statedir_puppet_4 = "/opt/puppetlabs/puppet/cache/state"
statedir = File.directory?(statedir_puppet_4) ? statedir_puppet_4 : statedir_puppet_3

# Default file locations, all derived from the chosen state directory.
# Each of them can be overridden from the command line.
agent_lockfile = "#{statedir}/agent_catalog_run.lock"
agent_disabled_lockfile = "#{statedir}/agent_disabled.lock"
statefile = "#{statedir}/state.yaml"
summaryfile = "#{statedir}/last_run_summary.yaml"

# Agent status, refined later from the lock files.
enabled = true
running = false
disabled_message = "reason not specified"

# Results of the last run, filled in from the state and summary files.
lastrun_failed = false
total_failure = false
lastrun = 0
lastrun_time = 0
failcount_resources = 0
failcount_events = 0

# Command line settings; a threshold of 0 means "not supplied".
warn = 0
crit = 0
enabled_only = false
failures = false
disable_perfdata = false
# Command line handling.
#
# Every option overrides one of the defaults set up earlier in the script.
# The value-taking options declare their argument as optional ("[X]"), so
# the handler receives nil when the option is given without a value; the
# file options ignore that case and keep their current setting.
opt = OptionParser.new

opt.on("--critical [CRIT]", "-c", Integer, "Critical threshold, time or failed resources") do |f|
  # nil.to_i is 0, which the validation below reports as "not specified".
  crit = f.to_i
end

opt.on("--warn [WARN]", "-w", Integer, "Warning threshold, time or failed resources") do |f|
  warn = f.to_i
end

opt.on("--check-failures", "-f", "Check for failed resources instead of time since run") do
  failures = true
end

opt.on("--only-enabled", "-e", "Only alert if Puppet is enabled") do
  enabled_only = true
end

opt.on("--state-dir [FILE]", "Location of the state directory containing lock and state files, default #{statedir}, will change location of the files") do |f|
  # Re-derive every file location from the new directory. Options are
  # handled in command line order, so a more specific option given later
  # still wins over these derived paths.
  if f
    statedir = f
    agent_lockfile = statedir + "/agent_catalog_run.lock"
    agent_disabled_lockfile = statedir + "/agent_disabled.lock"
    statefile = statedir + "/state.yaml"
    summaryfile = statedir + "/last_run_summary.yaml"
  end
end

opt.on("--agent-lock-file [FILE]", "-l", "Location of the agent run lock file, default #{agent_lockfile}") do |f|
  agent_lockfile = f if f
end

opt.on("--agent-disabled-lock-file [FILE]", "-d", "Location of the agent disabled lock file, default #{agent_disabled_lockfile}") do |f|
  agent_disabled_lockfile = f if f
end

opt.on("--state-file [FILE]", "-t", "Location of the state file, default #{statefile}") do |f|
  statefile = f if f
end

opt.on("--summary-file [FILE]", "-s", "Location of the summary file, default #{summaryfile}") do |f|
  summaryfile = f if f
end

opt.on("--disable-perfdata", "-x", "Disable performance data output") do |f|
  disable_perfdata = f
end

# A bad command line is a configuration problem, so report it as UNKNOWN
# (exit 3). Letting the exception escape would end the script with exit
# status 1, which the monitoring system reads as WARNING.
begin
  opt.parse!
rescue OptionParser::ParseError => e
  puts "UNKNOWN: #{e.message}"
  exit 3
end

# Both thresholds are mandatory; 0 is the "not supplied" marker.
if warn == 0 || crit == 0
  puts "Please specify a warning and critical level"
  exit 3
end
# Work out the agent status from its lock files.
#
# File.exist? is used because File.exists? was removed in Ruby 3.2.

# An empty run lock is treated as "agent disabled", a non-empty one as
# "run in progress".
if File.exist?(agent_lockfile)
  if File.zero?(agent_lockfile)
    enabled = false
  else
    running = true
  end
end

# The presence of the disabled lock marks the agent as disabled. The reason
# is taken from the last double-quoted value directly followed by "}" in the
# file (presumably a small JSON document written by the agent - confirm
# against the Puppet version in use).
if File.exist?(agent_disabled_lockfile)
  enabled = false
  reason =
    begin
      # File.read closes the handle again, unlike File.open(...).read.
      File.read(agent_disabled_lockfile).gsub(/.*"(.*)"\}/, '\1').strip
    rescue SystemCallError
      # The reason is informational only; an unreadable lock file must not
      # abort the whole check.
      ""
    end
  # gsub never returns nil, so the fallback has to test for an empty string.
  disabled_message = reason.empty? ? "reason not specified" : reason
end
# Determine when Puppet last ran and how the run went.

# First guess: modification time of the state file. It stays in effect
# whenever the summary file does not provide a usable timestamp.
# (File.exist? because File.exists? was removed in Ruby 3.2.)
lastrun = File.stat(statefile).mtime.to_i if File.exist?(statefile)

# Without a readable summary file nothing meaningful can be reported.
unless File.readable?(summaryfile)
  puts "UNKNOWN: Summary file not found or not readable. Check #{summaryfile}"
  exit 3
end

begin
  summary = YAML.load_file(summaryfile)
  # Keep the state file timestamp when the summary has no last_run entry;
  # assigning nil here would break the age calculation further down.
  lastrun = summary["time"]["last_run"] || lastrun
  lastrun_time = (summary["time"]["total"] || 0).round(2)

  if summary.include?("events")
    # Unless there were failures the hashes simply lack the failure count.
    failcount_resources = summary["resources"]["failed"] || 0
    failcount_events = summary["events"]["failure"] || 0
  else
    # Machines that outright failed to run, e.g. on missing dependencies,
    # are treated as huge failures. The YAML file is valid but holds
    # nothing except last_run.
    failcount_resources = 99
    failcount_events = 99
    total_failure = true
  end
rescue
  # Best effort: a summary that cannot be parsed or has an unexpected
  # structure is reported as zero failures, using whatever timestamp was
  # obtained before the error.
  failcount_resources = 0
  failcount_events = 0
  summary = nil
end
# Age of the last run in seconds.
time_since_last_run = Time.now.to_i - lastrun

# Human readable age: hours (with the UTC time of the run) from one hour
# upwards, minutes from one minute upwards, plain seconds otherwise.
time_since_last_run_string =
  if time_since_last_run >= 3600
    "#{time_since_last_run / 60 / 60} hours ago at #{Time.at(Time.now - time_since_last_run).utc.strftime('%R:%S')} UTC"
  elsif time_since_last_run >= 60
    "#{time_since_last_run / 60} minutes ago"
  else
    "#{time_since_last_run} seconds ago"
  end

# Performance data appended to every status line; empty when it has been
# switched off on the command line.
perfdata_time = disable_perfdata ? "" : "|time_since_last_run=#{time_since_last_run}s;#{warn};#{crit};0 failed_resources=#{failcount_resources};;;0 failed_events=#{failcount_events};;;0 last_run_duration=#{lastrun_time};;;0"
# Final verdict: print one status line and leave with the matching exit
# code (0 for the OK lines, 1 for WARNING, 2 for CRITICAL).
finish = lambda do |status, message|
  puts message
  exit status
end

# With --only-enabled a disabled agent never alerts, in either mode.
if enabled_only && !enabled
  finish.call(0, "OK: Puppet is currently disabled, not alerting. Last run #{time_since_last_run_string} with #{failcount_resources} failed resources #{failcount_events} failed events. Disabled with reason: #{disabled_message}#{perfdata_time}")
end

# A run that produced no events at all is critical, in either mode.
if total_failure
  finish.call(2, "CRITICAL: FAILED - Puppet failed to run. Missing dependencies? Catalog compilation failed? Last run #{time_since_last_run_string}#{perfdata_time}")
end

if failures
  # --check-failures: the thresholds apply to the number of failed resources.
  if failcount_resources >= crit
    finish.call(2, "CRITICAL: Puppet last ran had #{failcount_resources} failed resources #{failcount_events} failed events, expected < #{crit}#{perfdata_time}")
  elsif failcount_resources >= warn
    finish.call(1, "WARNING: Puppet last ran had #{failcount_resources} failed resources #{failcount_events} failed events, expected < #{warn}#{perfdata_time}")
  end
else
  # Default mode: the thresholds apply to the age of the last run in seconds.
  if time_since_last_run >= crit
    finish.call(2, "CRITICAL: last run #{time_since_last_run_string}, expected < #{crit}s#{perfdata_time}")
  elsif time_since_last_run >= warn
    finish.call(1, "WARNING: last run #{time_since_last_run_string}, expected < #{warn}s#{perfdata_time}")
  end
end

# Below both thresholds: fine when the agent is enabled, a warning when it
# is disabled.
if enabled
  finish.call(0, "OK: last run #{time_since_last_run_string} with #{failcount_resources} failed resources #{failcount_events} failed events and currently enabled#{perfdata_time}")
else
  finish.call(1, "WARNING: last run #{time_since_last_run_string} with #{failcount_resources} failed resources #{failcount_events} failed events and currently disabled with reason: #{disabled_message}#{perfdata_time}")
end