Skip to content
This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

add metrics to measure autoscaler performance and health #95

Merged
merged 3 commits into from
Nov 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions docs/configuration/telemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,3 +259,58 @@ Scaling state backend metrics allow operators to get insight into how the scalin
<td>Summary</td>
</tr>
</table>

# Autoscale Metrics

Autoscale metrics allow operators to get insight into how the autoscaler is functioning.

<table class="table table-bordered table-striped">
<tr>
<th>Metric</th>
<th>Description</th>
<th>Unit</th>
<th>Type</th>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.evaluation`</td>
<td>The time taken to perform the autoscaling evaluation for the job named {job}</td>
<td>Milliseconds</td>
<td>Summary</td>
</tr>
<tr>
<td>`sherpa.autoscale.evaluation.error`</td>
<td>Number of autoscaling evaluation errors across all jobs</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.evaluation.error`</td>
<td>Number of autoscaling evaluation errors for the job named {job}</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.trigger.error`</td>
<td>Number of autoscaling scale trigger errors across all jobs</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.trigger.error`</td>
<td>Number of autoscaling scale trigger errors for the job named {job}</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.trigger.success`</td>
<td>Number of autoscaling scale trigger successes across all jobs</td>
<td>Number of successes</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.trigger.success`</td>
<td>Number of autoscaling scale trigger successes for the job named {job}</td>
<td>Number of successes</td>
<td>Counter</td>
</tr>
</table>
7 changes: 7 additions & 0 deletions pkg/autoscale/autoscale.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package autoscale

import (
"strconv"
"time"

"github.com/armon/go-metrics"
nomad "github.com/hashicorp/nomad/api"
"github.com/jrasell/sherpa/pkg/helper"
"github.com/jrasell/sherpa/pkg/policy"
Expand All @@ -11,13 +13,15 @@ import (
)

func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.GroupScalingPolicy, t int64) {
defer metrics.MeasureSince([]string{"autoscale", jobID, "evaluation"}, time.Now())

// Create a new logger with the job in the context.
jobLogger := helper.LoggerWithJobContext(a.logger, jobID)

resourceInfo, allocs, err := a.getJobAllocations(jobID, policies)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to gather allocation details for job")
sendEvaluationErrorMetrics(jobID)
return
}

Expand All @@ -32,6 +36,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resourceUsage, err := a.getJobResourceUsage(allocs)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to gather job resource usage statistics")
sendEvaluationErrorMetrics(jobID)
return
}

Expand Down Expand Up @@ -104,13 +109,15 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resp, _, err := a.scaler.Trigger(jobID, scaleReq, state.SourceInternalAutoscaler)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to trigger scaling of job")
sendTriggerErrorMetrics(jobID)
}

if resp != nil {
jobLogger.Info().
Str("id", resp.ID.String()).
Str("evaluation-id", resp.EvaluationID).
Msg("successfully triggered autoscaling of job")
sendTriggerSuccessMetrics(jobID)
}
}
}
Expand Down
24 changes: 24 additions & 0 deletions pkg/autoscale/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package autoscale

import "github.com/armon/go-metrics"

// sendEvaluationErrorMetrics records a failed autoscaling evaluation for the given job. Two
// counters are incremented by one: the aggregate "autoscale.evaluation.error" counter shared by
// all jobs, followed by the per-job "autoscale.<job>.evaluation.error" counter.
func sendEvaluationErrorMetrics(job string) {
	// Aggregate key first, then the job-scoped key.
	keys := [][]string{
		{"autoscale", "evaluation", "error"},
		{"autoscale", job, "evaluation", "error"},
	}
	for _, key := range keys {
		metrics.IncrCounter(key, 1)
	}
}

// sendTriggerErrorMetrics records a failed scaling trigger for the given job. Two counters are
// incremented by one: the aggregate "autoscale.trigger.error" counter shared by all jobs,
// followed by the per-job "autoscale.<job>.trigger.error" counter.
func sendTriggerErrorMetrics(job string) {
	// Aggregate key first, then the job-scoped key.
	keys := [][]string{
		{"autoscale", "trigger", "error"},
		{"autoscale", job, "trigger", "error"},
	}
	for _, key := range keys {
		metrics.IncrCounter(key, 1)
	}
}

// sendTriggerSuccessMetrics records a successful scaling trigger for the given job. Two counters
// are incremented by one: the aggregate "autoscale.trigger.success" counter shared by all jobs,
// followed by the per-job "autoscale.<job>.trigger.success" counter.
func sendTriggerSuccessMetrics(job string) {
	// Aggregate key first, then the job-scoped key.
	keys := [][]string{
		{"autoscale", "trigger", "success"},
		{"autoscale", job, "trigger", "success"},
	}
	for _, key := range keys {
		metrics.IncrCounter(key, 1)
	}
}