Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics to measure processing time #499

Merged
merged 1 commit into from
Jan 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clustering/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type metricsSet struct {
replicas prometheus.Gauge
readyReplicas prometheus.Gauge
errantReplicas prometheus.Gauge
processingTime prometheus.Observer

backupTimestamp prometheus.Gauge
backupElapsed prometheus.Gauge
Expand Down Expand Up @@ -75,6 +76,7 @@ func newManagerProcess(c client.Client, r client.Reader, recorder record.EventRe
replicas: metrics.TotalReplicasVec.WithLabelValues(name.Name, name.Namespace),
readyReplicas: metrics.ReadyReplicasVec.WithLabelValues(name.Name, name.Namespace),
errantReplicas: metrics.ErrantReplicasVec.WithLabelValues(name.Name, name.Namespace),
processingTime: metrics.ProcessingTimeVec.WithLabelValues(name.Name, name.Namespace),
backupTimestamp: metrics.BackupTimestamp.WithLabelValues(name.Name, name.Namespace),
backupElapsed: metrics.BackupElapsed.WithLabelValues(name.Name, name.Namespace),
backupDumpSize: metrics.BackupDumpSize.WithLabelValues(name.Name, name.Namespace),
Expand All @@ -92,6 +94,7 @@ func newManagerProcess(c client.Client, r client.Reader, recorder record.EventRe
metrics.TotalReplicasVec.DeleteLabelValues(name.Name, name.Namespace)
metrics.ReadyReplicasVec.DeleteLabelValues(name.Name, name.Namespace)
metrics.ErrantReplicasVec.DeleteLabelValues(name.Name, name.Namespace)
metrics.ProcessingTimeVec.DeleteLabelValues(name.Name, name.Namespace)
metrics.BackupTimestamp.DeleteLabelValues(name.Name, name.Namespace)
metrics.BackupElapsed.DeleteLabelValues(name.Name, name.Namespace)
metrics.BackupDumpSize.DeleteLabelValues(name.Name, name.Namespace)
Expand Down Expand Up @@ -130,7 +133,9 @@ func (p *managerProcess) Start(ctx context.Context, interval time.Duration) {
}

p.metrics.checkCount.Inc()
startTime := time.Now()
redo, err := p.do(ctx)
p.metrics.processingTime.Observe(time.Since(startTime).Seconds())
if err != nil {
p.metrics.errorCount.Inc()
p.log.Error(err, "error")
Expand Down
31 changes: 16 additions & 15 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,22 @@ Aside from [the standard Go runtime and process metrics][standard], it exposes m

All these metrics are prefixed with `moco_cluster_` and have `name` and `namespace` labels.

| Name | Description | Type |
|-------------------------------------|------------------------------------------------------------------------|---------|
| `checks_total` | The number of times MOCO checked the cluster | Counter |
| `errors_total` | The number of times MOCO encountered errors when managing the cluster | Counter |
| `available` | 1 if the cluster is available, 0 otherwise | Gauge |
| `healthy` | 1 if the cluster is running without any problems, 0 otherwise | Gauge |
| `switchover_total` | The number of times MOCO changed the live primary instance | Counter |
| `failover_total` | The number of times MOCO changed the failed primary instance | Counter |
| `replicas` | The number of mysqld instances in the cluster | Gauge |
| `ready_replicas` | The number of ready mysqld Pods in the cluster | Gauge |
| `errant_replicas` | The number of mysqld instances that have [errant transactions][errant] | Gauge |
| `volume_resized_total` | The number of successful volume resizes | Counter |
| `volume_resized_errors_total` | The number of failed volume resizes | Counter |
| `statefulset_recreate_total` | The number of successful StatefulSet recreates | Counter |
| `statefulset_recreate_errors_total` | The number of failed StatefulSet recreates | Counter |
| Name | Description | Type |
| ----------------------------------- | ---------------------------------------------------------------------- | --------- |
| `checks_total` | The number of times MOCO checked the cluster | Counter |
| `errors_total` | The number of times MOCO encountered errors when managing the cluster | Counter |
| `available` | 1 if the cluster is available, 0 otherwise | Gauge |
| `healthy` | 1 if the cluster is running without any problems, 0 otherwise | Gauge |
| `switchover_total` | The number of times MOCO changed the live primary instance | Counter |
| `failover_total` | The number of times MOCO changed the failed primary instance | Counter |
| `replicas` | The number of mysqld instances in the cluster | Gauge |
| `ready_replicas` | The number of ready mysqld Pods in the cluster | Gauge |
| `errant_replicas` | The number of mysqld instances that have [errant transactions][errant] | Gauge |
| `processing_time_seconds` | The time in seconds spent processing the cluster | Histogram |
| `volume_resized_total` | The number of successful volume resizes | Counter |
| `volume_resized_errors_total` | The number of failed volume resizes | Counter |
| `statefulset_recreate_total` | The number of successful StatefulSet recreates | Counter |
| `statefulset_recreate_errors_total` | The number of failed StatefulSet recreates | Counter |

### Backup

Expand Down
10 changes: 10 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ var (
TotalReplicasVec *prometheus.GaugeVec
ReadyReplicasVec *prometheus.GaugeVec
ErrantReplicasVec *prometheus.GaugeVec
ProcessingTimeVec *prometheus.HistogramVec

VolumeResizedTotal *prometheus.CounterVec
VolumeResizedErrorTotal *prometheus.CounterVec
Expand Down Expand Up @@ -112,6 +113,15 @@ func Register(registry prometheus.Registerer) {
}, []string{"name", "namespace"})
registry.MustRegister(ErrantReplicasVec)

ProcessingTimeVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: metricsNamespace,
Subsystem: clusteringSubsystem,
Name: "processing_time_seconds",
Help: "The length of time in seconds processing the cluster",
Buckets: []float64{0.1, 0.25, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10, 20, 30},
}, []string{"name", "namespace"})
registry.MustRegister(ProcessingTimeVec)

BackupTimestamp = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: backupSubsystem,
Expand Down