// collector.go, forked from larseen/pgbouncer_exporter.
// The upstream repository was archived by its owner on Feb 13, 2020 and is read-only.

package main

import (
	"database/sql"
	"fmt"
	"math"
	"strconv"
	"sync"
	"time"

	"github.com/lib/pq"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/log"
)

type columnUsage int

const (
	LABEL    columnUsage = iota // Use this column as a label
	COUNTER                     // Use this column as a counter
	GAUGE                       // Use this column as a gauge
	GAUGE_MS                    // Use this column for gauges that are microsecond data
)

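// For example, SHOW STATS reports durations such as total_query_time in
// microseconds; mapping them as GAUGE_MS makes makeDescMap attach a 1e-6
// multiplier, so the exported values come out in seconds.
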
type rowResult struct {
	ColumnNames []string
	ColumnIdx   map[string]int
	ColumnData  []interface{}
}

type RowConverter func(*MetricMapNamespace, *rowResult, chan<- prometheus.Metric) ([]error, error)

// MetricMapNamespace groups metric maps under a shared set of labels.
type MetricMapNamespace struct {
	namespace      string
	columnMappings map[string]MetricMap // Column mappings in this namespace
	labels         []string
	rowFunc        RowConverter
}

// Query runs the query for a namespace mapping and emits metrics. It returns a
// fatal error if the scrape fails, and a slice of errors that were non-fatal.
func (m *MetricMapNamespace) Query(ch chan<- prometheus.Metric, db *sql.DB) ([]error, error) {
	query := fmt.Sprintf("SHOW %s;", m.namespace)

	// Don't fail on a bad scrape of one metric
	rows, err := db.Query(query)
	if err != nil {
		return []error{}, fmt.Errorf("error running query on database %s: %s", m.namespace, err)
	}
	defer rows.Close()

	var result rowResult
	result.ColumnNames, err = rows.Columns()
	if err != nil {
		return []error{}, fmt.Errorf("error retrieving column list for %s: %s", m.namespace, err)
	}

	// Make a lookup map for the column indices
	result.ColumnIdx = make(map[string]int, len(result.ColumnNames))
	for i, n := range result.ColumnNames {
		result.ColumnIdx[n] = i
	}

	result.ColumnData = make([]interface{}, len(result.ColumnNames))
	var scanArgs = make([]interface{}, len(result.ColumnNames))
	for i := range result.ColumnData {
		scanArgs[i] = &(result.ColumnData[i])
	}

	nonfatalErrors := []error{}
	for rows.Next() {
		err = rows.Scan(scanArgs...)
		if err != nil {
			return []error{}, fmt.Errorf("error retrieving rows for %s: %s", m.namespace, err)
		}
		n, e := m.rowFunc(m, &result, ch)
		if n != nil {
			nonfatalErrors = append(nonfatalErrors, n...)
		}
		if e != nil {
			return nonfatalErrors, e
		}
	}
	if err := rows.Err(); err != nil {
		log.Errorf("Failed scanning all rows for %s: %s", m.namespace, err)
		nonfatalErrors = append(nonfatalErrors, fmt.Errorf("failed to consume all rows due to: %s", err))
	}
	return nonfatalErrors, nil
}

func metricRowConverter(m *MetricMapNamespace, result *rowResult, ch chan<- prometheus.Metric) ([]error, error) {
	var nonFatalErrors []error
	labelValues := []string{}

	// Collect label data first.
	for _, name := range m.labels {
		val := result.ColumnData[result.ColumnIdx[name]]
		if val == nil {
			labelValues = append(labelValues, "")
		} else if v, ok := val.(string); ok {
			labelValues = append(labelValues, v)
		} else if v, ok := val.(int64); ok {
			labelValues = append(labelValues, strconv.FormatInt(v, 10))
		}
	}

	for idx, columnName := range result.ColumnNames {
		if metricMapping, ok := m.columnMappings[columnName]; ok {
			value, ok := dbToFloat64(result.ColumnData[idx])
			if !ok {
				nonFatalErrors = append(nonFatalErrors, fmt.Errorf("unexpected error parsing column %s.%s: %v", m.namespace, columnName, result.ColumnData[idx]))
				continue
			}
			log.Debugln("successfully parsed column:", m.namespace, columnName, result.ColumnData[idx])
			// Generate the metric
			ch <- prometheus.MustNewConstMetric(metricMapping.desc, metricMapping.vtype, value*metricMapping.multiplier, labelValues...)
		} else {
			log.Debugln("Ignoring column for metric conversion:", m.namespace, columnName)
		}
	}
	return nonFatalErrors, nil
}

func metricKVConverter(m *MetricMapNamespace, result *rowResult, ch chan<- prometheus.Metric) ([]error, error) {
	// Row format is key, value, <ignorable>.
	if len(result.ColumnData) < 2 {
		return nil, fmt.Errorf("received row results for KV parsing, but not enough columns; something is deeply broken: %s %v", m.namespace, result.ColumnData)
	}
	var key string
	switch v := result.ColumnData[0].(type) {
	case string:
		key = v
	default:
		return nil, fmt.Errorf("received row results for KV parsing, but key field isn't a string: %s %v", m.namespace, result.ColumnData)
	}
	// Is it a key we care about?
	if metricMapping, ok := m.columnMappings[key]; ok {
		value, ok := dbToFloat64(result.ColumnData[1])
		if !ok {
			return []error{fmt.Errorf("unexpected error parsing KV value %s.%s: %v", m.namespace, key, result.ColumnData[1])}, nil
		}
		log.Debugln("successfully parsed column:", m.namespace, key, result.ColumnData[1])
		// Generate the metric
		ch <- prometheus.MustNewConstMetric(metricMapping.desc, metricMapping.vtype, value*metricMapping.multiplier)
	} else {
		log.Debugln("Ignoring column for KV conversion:", m.namespace, key)
	}
	return nil, nil
}

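// To illustrate the two row shapes the converters above handle (a sketch only;
// the exact column sets vary by pgbouncer version):
//
//	SHOW POOLS  -> metricRowConverter: one metric per mapped column per row
//	    database | user | cl_active | cl_waiting | ...
//	SHOW CONFIG -> metricKVConverter: rows of key | value | <ignorable>,
//	    one metric per key present in the namespace's columnMappings
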
// MetricMap stores the prometheus metric description which a given column will
// be mapped to by the collector.
type MetricMap struct {
	vtype      prometheus.ValueType // Prometheus valuetype
	namespace  string
	desc       *prometheus.Desc // Prometheus descriptor
	multiplier float64          // Multiplier applied to pgbouncer values when converting to prometheus norms.
}

type ColumnMapping struct {
	usage       columnUsage `yaml:"usage"`
	description string      `yaml:"description"`
}

// Exporter collects PgBouncer stats from the given server and exports
// them using the prometheus metrics package.
type Exporter struct {
	connectionString string
	namespace        string
	mutex            sync.RWMutex

	duration, up, error prometheus.Gauge
	totalScrapes        prometheus.Counter

	metricMap []*MetricMapNamespace
	db        *sql.DB
}

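// metricKVMaps describes SHOW commands whose output is key/value rows
// (currently just SHOW CONFIG); each mapped key becomes one metric.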
var metricKVMaps = map[string]map[string]ColumnMapping{
	"config": {
		"listen_backlog": {COUNTER, "Maximum number of backlogged listen connections before further connection attempts are dropped"},
		"max_client_conn": {GAUGE, "Maximum number of client connections allowed"},
		"default_pool_size": {GAUGE, "The default for how many server connections to allow per user/database pair"},
		"min_pool_size": {GAUGE, "Minimum number of backends a pool will always retain."},
		"reserve_pool_size": {GAUGE, "How many additional connections to allow to a pool once it has crossed its maximum"},
		"reserve_pool_timeout": {GAUGE, "If a client has not been serviced in this many seconds, pgbouncer enables use of additional connections from the reserve pool."},
		"max_db_connections": {GAUGE, "Server-level maximum connections enforced for a given db, regardless of pool limits"},
		"max_user_connections": {GAUGE, "Maximum number of connections a user can open, regardless of pool limits"},
		"autodb_idle_timeout": {GAUGE, "Unused pools created via '*' are reclaimed after this interval"},
		// server_reset_query should be enabled as a label for just this metric.
		"server_reset_query_always": {GAUGE, "Boolean indicating whether server_reset_query is enforced for all pooling modes, or just session pooling"},
		// server_check_query should be enabled as a label for just this metric.
		"server_check_delay": {GAUGE, "How long to keep released connections available for immediate re-use, without running sanity-check queries on them. If 0, the check query is always run."},
		"query_timeout": {GAUGE, "Maximum time that a query can run for before being cancelled."},
		"query_wait_timeout": {GAUGE, "Maximum time that a query can wait to be executed before being cancelled."},
		"client_idle_timeout": {GAUGE, "Client connections idling longer than this many seconds are closed"},
		"client_login_timeout": {GAUGE, "Maximum time in seconds for a client to either log in or be disconnected"},
		"idle_transaction_timeout": {GAUGE, "If a client has been in the 'idle in transaction' state longer than this many seconds, it will be disconnected."},
		"server_lifetime": {GAUGE, "The pooler will close an unused server connection that has been connected longer than this many seconds"},
		"server_idle_timeout": {GAUGE, "If a server connection has been idle more than this many seconds it will be dropped"},
		"server_connect_timeout": {GAUGE, "Maximum time allowed for connecting and logging into a backend server"},
		"server_login_retry": {GAUGE, "If connecting to a backend failed, this is the wait interval in seconds before retrying"},
		"server_round_robin": {GAUGE, "Boolean; if 1, pgbouncer uses backends in a round-robin fashion. If 0, it uses LIFO to minimize connectivity to backends"},
		"suspend_timeout": {GAUGE, "How long pgbouncer waits for buffer flushes before killing connections during pgbouncer admin SHUTDOWN and SUSPEND invocations."},
		"disable_pqexec": {COUNTER, "Boolean; 1 means pgbouncer enforces the Simple Query Protocol; 0 means it allows multiple queries in a single packet"},
		"dns_max_ttl": {GAUGE, "Regardless of the DNS TTL, this is the TTL that pgbouncer enforces for DNS lookups it does for backends"},
		"dns_nxdomain_ttl": {GAUGE, "Regardless of the DNS TTL, this is the period enforced for negative DNS answers"},
		"dns_zone_check_period": {GAUGE, "Period to check if a zone serial has changed."},
		"max_packet_size": {GAUGE, "Maximum packet size for PostgreSQL packets that pgbouncer will relay to backends"},
		"pkt_buf": {COUNTER, "Internal buffer size for packets. See docs"},
		"sbuf_loopcnt": {GAUGE, "How many results to process for a given connection's packet results before switching to others to ensure fairness. See docs."},
		"tcp_defer_accept": {GAUGE, "Configurable for TCP_DEFER_ACCEPT"},
		"tcp_socket_buffer": {GAUGE, "Configurable for TCP socket buffering; 0 is kernel-managed"},
		"tcpkeepalive": {GAUGE, "Boolean; if 1, TCP keepalive is enabled with OS defaults. If 0, disabled."},
		"tcp_keepcnt": {GAUGE, "See TCP documentation for this field"},
		"tcp_keepidle": {GAUGE, "See TCP documentation for this field"},
		"tcp_keepintvl": {GAUGE, "See TCP documentation for this field"},
		"verbose": {GAUGE, "Log verbosity level. Only relevant as a metric if log volume begins exceeding log consumption"},
		"stats_period": {GAUGE, "Periodicity in seconds of pgbouncer recalculating internal stats."},
		"log_connections": {GAUGE, "Whether connections are logged or not."},
		"log_disconnections": {GAUGE, "Whether connection disconnects are logged."},
		"log_pooler_errors": {GAUGE, "Whether pooler errors are logged or not"},
		"application_name_add_host": {GAUGE, "Whether pgbouncer adds the client host address and port to the application name setting set on connection start"},
	},
}

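// metricRowMaps describes SHOW commands whose output is one row per entity
// (database, pool, ...); each mapped column becomes one metric per row.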
var metricRowMaps = map[string]map[string]ColumnMapping{
	"databases": {
		"name": {LABEL, ""},
		"host": {LABEL, ""},
		"port": {LABEL, ""},
		"database": {LABEL, ""},
		"force_user": {LABEL, ""},
		"pool_size": {GAUGE, "Maximum number of pool backend connections"},
		"reserve_pool": {GAUGE, "Maximum amount that the pool size can be exceeded temporarily"},
		"pool_mode": {LABEL, ""},
		"max_connections": {GAUGE, "Maximum number of client connections allowed"},
		"current_connections": {GAUGE, "Current number of client connections"},
		"paused": {GAUGE, "Boolean indicating whether a pgbouncer PAUSE is currently active for this database"},
		"disabled": {GAUGE, "Boolean indicating whether a pgbouncer DISABLE is currently active for this database"},
	},
	"lists": {
		"databases": {GAUGE, "Count of databases"},
		"users": {GAUGE, "Count of users"},
		"pools": {GAUGE, "Count of pools"},
		"free_clients": {GAUGE, "Count of free clients"},
		"used_clients": {GAUGE, "Count of used clients"},
		"login_clients": {GAUGE, "Count of clients in login state"},
		"free_servers": {GAUGE, "Count of free servers"},
		"used_servers": {GAUGE, "Count of used servers"},
	},
	"pools": {
		"database": {LABEL, ""},
		"user": {LABEL, ""},
		"cl_active": {GAUGE, "Client connections linked to server connection and able to process queries, shown as connection"},
		"cl_waiting": {GAUGE, "Client connections waiting on a server connection, shown as connection"},
		"sv_active": {GAUGE, "Server connections linked to a client connection, shown as connection"},
		"sv_idle": {GAUGE, "Server connections idle and ready for a client query, shown as connection"},
		"sv_used": {GAUGE, "Server connections idle more than server_check_delay, needing server_check_query, shown as connection"},
		"sv_tested": {GAUGE, "Server connections currently running either server_reset_query or server_check_query, shown as connection"},
		"sv_login": {GAUGE, "Server connections currently in the process of logging in, shown as connection"},
		"maxwait": {GAUGE, "Age of oldest unserved client connection, shown as second"},
		"pool_mode": {LABEL, ""},
	},
	"stats": {
		"database": {LABEL, ""},
		"avg_query_count": {GAUGE, "Average queries per second in last stat period"},
		"avg_query": {GAUGE_MS, "The average query duration, shown as microsecond"},
		"avg_query_time": {GAUGE_MS, "Average query duration in microseconds"},
		"avg_recv": {GAUGE, "Average received (from clients) bytes per second"},
		"avg_req": {GAUGE, "The average number of requests per second in last stat period, shown as request/second"},
		"avg_sent": {GAUGE, "Average sent (to clients) bytes per second"},
		"avg_wait_time": {GAUGE_MS, "Time spent by clients waiting for a server in microseconds (average per second)"},
		"avg_xact_count": {GAUGE, "Average transactions per second in last stat period"},
		"avg_xact_time": {GAUGE_MS, "Average transaction duration in microseconds"},
		"bytes_received_per_second": {GAUGE, "The total network traffic received, shown as byte/second"},
		"bytes_sent_per_second": {GAUGE, "The total network traffic sent, shown as byte/second"},
		"total_query_count": {GAUGE, "Total number of SQL queries pooled"},
		"total_query_time": {GAUGE_MS, "Total number of microseconds spent by pgbouncer when actively connected to PostgreSQL, executing queries"},
		"total_received": {GAUGE, "Total volume in bytes of network traffic received by pgbouncer, shown as bytes"},
		"total_requests": {GAUGE, "Total number of SQL requests pooled by pgbouncer, shown as requests"},
		"total_sent": {GAUGE, "Total volume in bytes of network traffic sent by pgbouncer, shown as bytes"},
		"total_wait_time": {GAUGE_MS, "Time spent by clients waiting for a server in microseconds"},
		"total_xact_count": {GAUGE, "Total number of SQL transactions pooled"},
		"total_xact_time": {GAUGE_MS, "Total number of microseconds spent by pgbouncer when connected to PostgreSQL in a transaction, either idle in transaction or executing queries"},
	},
}

func NewExporter(connectionString string, namespace string) *Exporter {
	db, err := getDB(connectionString)
	if err != nil {
		log.Fatal(err)
	}

	return &Exporter{
		metricMap: makeDescMap(namespace),
		namespace: namespace,
		db:        db,
		up: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      "up",
			Help:      "Was the PgBouncer instance query successful?",
		}),
		duration: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      "last_scrape_duration_seconds",
			Help:      "Duration of the last scrape of metrics from PgBouncer.",
		}),
		totalScrapes: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Name:      "scrapes_total",
			Help:      "Total number of times PgBouncer has been scraped for metrics.",
		}),
		error: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      "last_scrape_error",
			Help:      "Whether the last scrape of metrics from PgBouncer resulted in an error (1 for error, 0 for success).",
		}),
	}
}

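// A minimal sketch of how this Exporter gets wired up (the real entrypoint
// lives elsewhere in this repository; the connection string, listen address,
// and promhttp handler below are illustrative assumptions):
//
//	exporter := NewExporter("postgres://localhost:6432/pgbouncer?sslmode=disable", "pgbouncer")
//	prometheus.MustRegister(exporter)
//	http.Handle("/metrics", promhttp.Handler())
//	log.Fatal(http.ListenAndServe(":9127", nil))
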
func getDB(conn string) (*sql.DB, error) {
	// Note we use OpenDB so we can still create the connector even if the backend is down.
	connector, err := pq.NewConnector(conn)
	if err != nil {
		return nil, err
	}
	db := sql.OpenDB(connector)
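	// Cap the pool at a single connection so the exporter never holds more
	// than one admin session open against PgBouncer.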
	db.SetMaxOpenConns(1)
	db.SetMaxIdleConns(1)
	return db, nil
}

// Convert database/sql types to string for Prometheus labels. Null types are mapped to empty strings.
func dbToString(t interface{}) (string, bool) {
	switch v := t.(type) {
	case int64:
		return fmt.Sprintf("%v", v), true
	case float64:
		return fmt.Sprintf("%v", v), true
	case time.Time:
		return fmt.Sprintf("%v", v.Unix()), true
	case nil:
		return "", true
	case []byte:
		// Try and convert to string
		return string(v), true
	case string:
		return v, true
	default:
		return "", false
	}
}

// Convert database/sql types to float64s for Prometheus consumption. Null types are mapped to NaN.
// string and []byte types are parsed as float64; on parse failure they map to NaN and !ok.
func dbToFloat64(t interface{}) (float64, bool) {
	switch v := t.(type) {
	case int64:
		return float64(v), true
	case float64:
		return v, true
	case time.Time:
		return float64(v.Unix()), true
	case []byte:
		// Try and convert to string and then parse to a float64
		strV := string(v)
		result, err := strconv.ParseFloat(strV, 64)
		if err != nil {
			return math.NaN(), false
		}
		return result, true
	case string:
		result, err := strconv.ParseFloat(v, 64)
		if err != nil {
			log.Infoln("Could not parse string:", err)
			return math.NaN(), false
		}
		return result, true
	case nil:
		return math.NaN(), true
	default:
		return math.NaN(), false
	}
}

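// A few illustrative conversions (not exercised in this file):
//
//	dbToFloat64(int64(42))      // 42, true
//	dbToFloat64([]byte("3.14")) // 3.14, true
//	dbToFloat64(nil)            // NaN, true
//	dbToFloat64("not-a-float")  // NaN, false
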
// Describe implements prometheus.Collector.
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
	// We cannot know in advance what metrics the exporter will generate
	// from PgBouncer. So we use the poor man's describe method: Run a collect
	// and send the descriptors of all the collected metrics. The problem
	// here is that we need to connect to PgBouncer. If it is currently
	// unavailable, the descriptors will be incomplete. Since this is a
	// stand-alone exporter and not used as a library within other code
	// implementing additional metrics, the worst that can happen is that we
	// don't detect inconsistent metrics created by this exporter
	// itself. Also, a change in the monitored PgBouncer instance may change the
	// exported metrics during the runtime of the exporter.
	metricCh := make(chan prometheus.Metric)
	doneCh := make(chan struct{})

	go func() {
		for m := range metricCh {
			ch <- m.Desc()
		}
		close(doneCh)
	}()

	e.Collect(metricCh)
	close(metricCh)
	<-doneCh
}

// Collect implements prometheus.Collector.
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
	e.scrape(ch)
	ch <- e.duration
	ch <- e.up
	ch <- e.totalScrapes
	ch <- e.error
}

func (e *Exporter) scrape(ch chan<- prometheus.Metric) {
	defer func(begun time.Time) {
		e.duration.Set(time.Since(begun).Seconds())
		log.Info("Ending scrape")
	}(time.Now())

	log.Info("Starting scrape")
	e.mutex.RLock()
	defer e.mutex.RUnlock()

	e.error.Set(0)
	e.totalScrapes.Inc()

	if err := e.db.Ping(); err != nil {
		log.Errorf("Backend is down, failed to connect: %s", err)
		e.error.Set(1)
		e.up.Set(0)
		return
	}
	log.Debug("Backend is up, proceeding with scrape")
	e.up.Set(1)

	for _, mapping := range e.metricMap {
		nonfatal, err := mapping.Query(ch, e.db)
		for _, suberr := range nonfatal {
			log.Errorln(suberr.Error())
		}
		if err != nil {
			// A fatal query error shouldn't kill the exporter; record the
			// failure and abort this scrape.
			log.Errorln(err.Error())
			e.error.Set(1)
			return
		}
		e.error.Add(float64(len(nonfatal)))
	}
}

func makeDescMap(metricNamespace string) []*MetricMapNamespace {
	var metricMap []*MetricMapNamespace

	convert := func(namespace string, mappings map[string]ColumnMapping, converter RowConverter) *MetricMapNamespace {
		thisMap := make(map[string]MetricMap)
		labels := []string{}
		for columnName, columnMapping := range mappings {
			if columnMapping.usage == LABEL {
				labels = append(labels, columnName)
			}
		}
		for columnName, columnMapping := range mappings {
			// Determine how to convert the column based on its usage.
			desc := prometheus.NewDesc(fmt.Sprintf("%s_%s_%s", metricNamespace, namespace, columnName), columnMapping.description, labels, nil)
			switch columnMapping.usage {
			case COUNTER:
				thisMap[columnName] = MetricMap{
					vtype:      prometheus.CounterValue,
					desc:       desc,
					multiplier: 1,
				}
			case GAUGE:
				thisMap[columnName] = MetricMap{
					vtype:      prometheus.GaugeValue,
					desc:       desc,
					multiplier: 1,
				}
			case GAUGE_MS:
				thisMap[columnName] = MetricMap{
					vtype:      prometheus.GaugeValue,
					desc:       desc,
					multiplier: 1e-6,
				}
			}
		}
		return &MetricMapNamespace{namespace: namespace, columnMappings: thisMap, labels: labels, rowFunc: converter}
	}

	for namespace, mappings := range metricRowMaps {
		metricMap = append(metricMap, convert(namespace, mappings, metricRowConverter))
	}
	for namespace, mappings := range metricKVMaps {
		metricMap = append(metricMap, convert(namespace, mappings, metricKVConverter))
	}
	return metricMap
}