forked from vimeo/docker-skyline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathskyline-settings.py
234 lines (191 loc) · 8.2 KB
/
skyline-settings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
"""
Shared settings
"""
# Filesystem path of the Redis unix socket.
REDIS_SOCKET_PATH = "/tmp/redis.sock"

# Directory holding the Skyline logs. Do not include a trailing slash.
LOG_PATH = "/var/log/skyline"

# Directory holding the Skyline pids. Do not include a trailing slash.
PID_PATH = "/var/run/skyline"

# Prefix given to metrics stored in Redis.
FULL_NAMESPACE = "metrics."

# The Horizon agent makes T'd writes to both the full namespace and the mini
# namespace. Oculus gets its data from everything in the mini namespace.
MINI_NAMESPACE = "mini."

# Rolling duration that will be stored in Redis. Pick a value that suits your
# memory capacity, your CPU capacity, and your overall metrics count. Longer
# durations take longer to analyze, but they can help the algorithms reduce
# the noise and provide more accurate anomaly detection.
# NOTE(review): 86400 is one day if the unit is seconds -- confirm the unit.
FULL_DURATION = 24 * 60 * 60

# Duration of the 'mini' namespace, relevant if you are also using the Oculus
# service. It is also the duration of data displayed in the web app 'mini'
# view.
# NOTE(review): 3600 is one hour if the unit is seconds -- confirm the unit.
MINI_DURATION = 60 * 60

# Destination hostname that horizon and analyzer send their metrics to.
GRAPHITE_HOST = "your_carbon_host"

# Destination port that horizon and analyzer send their metrics to.
CARBON_PORT = 2003

# URL used to link to Graphite (or another Graphite dashboard). The %s is
# replaced by the metric name.
GRAPH_URL = "http://your_graphite_host/render/?width=1400&from=-1hour&target=%s"

# If you have Oculus set up, set this to its URL (including http://) to enable
# the clickthrough on the webapp. If you don't want to use Oculus, set this to
# an empty string; speed improvements occur when Oculus support is disabled.
# Do not comment this setting out, or Skyline won't work!
OCULUS_HOST = "http://your_oculus_host.com"
"""
Analyzer settings
"""
# Where the Skyline agent writes the anomalies file on disk. The location
# needs to be accessible to the webapp.
ANOMALY_DUMP = "webapp/static/dump/anomalies.json"

# Number of processes the Skyline analyzer will spawn. Analysis is a very
# CPU-intensive procedure; you will see optimal results if you set
# ANALYZER_PROCESSES to several less than the total number of CPUs on your
# box. Be sure to leave some CPU room for the Horizon workers and for Redis.
ANALYZER_PROCESSES = 5

# Duration, in seconds, after which a metric becomes 'stale' and the analyzer
# ignores it until new datapoints are added. 'Staleness' means that a
# datapoint has not been added for STALE_PERIOD seconds.
STALE_PERIOD = 500

# Minimum length of a timeseries, in datapoints, for the analyzer to
# recognize it as a complete series.
MIN_TOLERABLE_LENGTH = 1

# Sometimes a metric will continually transmit the same number. There is no
# need to analyze metrics that remain boring like this, so this setting
# determines how many boring datapoints may accumulate before the analyzer
# skips over the metric. If the metric becomes noisy again, the analyzer will
# stop ignoring it.
MAX_TOLERABLE_BOREDOM = 100

# By default, the analyzer skips a metric if it has transmitted a single
# number MAX_TOLERABLE_BOREDOM times. Change this setting if you wish the
# size of the ignored set to be higher (i.e., ignore the metric if there have
# only been two different values for the past MAX_TOLERABLE_BOREDOM
# datapoints). This is useful for timeseries that often oscillate between two
# values.
BOREDOM_SET_SIZE = 1

# The canary metric should be a metric with a very high, reliable resolution
# that you can use to gauge the status of the system as a whole.
CANARY_METRIC = "statsd.numStats"
# Algorithms that the Analyzer will run. To add a new algorithm, you must
# both define it in algorithms.py and add its name to this list.
ALGORITHMS = [
    "first_hour_average",
    "mean_subtraction_cumulation",
    "stddev_from_average",
    "stddev_from_moving_average",
    "least_squares",
    "grubbs",
    "histogram_bins",
    "median_absolute_deviation",
    "ks_test",
]

# Number of algorithms that must return True before a metric is classified
# as anomalous.
CONSENSUS = 6
# Enables second order anomalies. This is an experimental feature, so it is
# turned off by default.
ENABLE_SECOND_ORDER = False

# Enables alerting.
ENABLE_ALERTS = True

# Which metrics to alert on and which strategy to use for each. Alerts will
# not fire twice within EXPIRATION_TIME, even if they trigger again.
# Schema:
#     (
#         ("metric1", "smtp", EXPIRATION_TIME),
#         ("metric2", "pagerduty", EXPIRATION_TIME),
#         ("metric3", "hipchat", EXPIRATION_TIME),
#     )
ALERTS = (
    ("skyline", "smtp", 1800),
)
# Each alert module requires additional information.
SMTP_OPTS = {
    # Sender address used for email alerts.
    "sender": "skyline-alerts@etsy.com",

    # Dictionary mapping metric names (exactly matching those listed in
    # ALERTS) to an array of e-mail addresses.
    "recipients": {
        "skyline": ["abe@etsy.com", "you@yourcompany.com"],
    },
}
# HipChat alerts require python-simple-hipchat
HIPCHAT_OPTS = {
    # Auth token for HipChat. This is a placeholder: replace it with your own
    # HipChat token (not the PagerDuty one, which lives in PAGERDUTY_OPTS).
    "auth_token": "hipchat_auth_token",

    # Dictionary mapping metric names to the hipchat room_ids to notify about
    # each anomaly (similar to SMTP_OPTS['recipients']).
    "rooms": {
        "skyline": (12345,),
    },

    # Background color of hipchat messages.
    # (One of "yellow", "red", "green", "purple", "gray", or "random".)
    "color": "purple",
}
# PagerDuty alerts require pygerduty
PAGERDUTY_OPTS = {
    # Your pagerduty subdomain.
    "subdomain": "example",

    # Your pagerduty auth token.
    "auth_token": "your_pagerduty_auth_token",

    # Service API key (shown on the detail page of a "Generic API" service).
    "key": "your_pagerduty_service_api_key",
}
"""
Horizon settings
"""
# Number of worker processes that will consume from the Horizon queue.
WORKER_PROCESSES = 2

# The IP address for Horizon to listen on. Defaults to gethostname()
# HORIZON_IP = '0.0.0.0'

# Port that listens for Graphite pickles over TCP, sent by Graphite's
# carbon-relay agent.
PICKLE_PORT = 2024

# Port that listens for Messagepack-encoded UDP packets.
UDP_PORT = 2025

# How big a 'chunk' of metrics will be before it is added onto the shared
# queue for processing into Redis. If you are noticing that Horizon is having
# trouble consuming metrics, try setting this value higher.
CHUNK_SIZE = 10

# Maximum allowable length of the processing queue before new chunks are
# prevented from being added. If you consistently fill up the processing
# queue, a higher MAX_QUEUE_SIZE will not save you. It most likely means that
# the workers do not have enough CPU allotted to process the queue on time.
# Try increasing CHUNK_SIZE, decreasing ANALYZER_PROCESSES, or decreasing
# ROOMBA_PROCESSES.
MAX_QUEUE_SIZE = 500

# Number of Roomba processes that will be spawned to trim timeseries in order
# to keep them at FULL_DURATION. Keep this number small, as it is not
# important that metrics be exactly FULL_DURATION *all* the time.
ROOMBA_PROCESSES = 1

# Normally Roomba will clean up everything that is older than FULL_DURATION.
# If you have metrics that are not coming in every second, it can happen that
# you'll end up with INCOMPLETE metrics. With this setting Roomba will clean
# up everything that is older than FULL_DURATION + ROOMBA_GRACE_TIME.
ROOMBA_GRACE_TIME = 600

# The Horizon agent will ignore incoming datapoints if their timestamp is
# older than MAX_RESOLUTION seconds ago.
MAX_RESOLUTION = 1000
# Metrics that, for whatever reason, you do not want to store in Skyline.
# The Listener will check whether each incoming metric contains anything in
# the skip list. It is generally wise to skip entire namespaces by adding a
# '.' at the end of the skipped item - otherwise you might skip things you
# don't intend to.
SKIP_LIST = [
    "example.statsd.metric",
    "another.example.metric",
    # If you use statsd, these can result in many near-equal series:
    # '_90',
    # '.lower',
    # '.upper',
    # '.median',
    # '.count_ps',
    # '.sum',
]
"""
Webapp settings
"""
# IP address for the webapp.
WEBAPP_IP = "127.0.0.1"

# Port for the webapp.
WEBAPP_PORT = 1500