Skip to content
This repository has been archived by the owner on Sep 9, 2020. It is now read-only.

Commit

Permalink
Merge pull request #95 from jrasell/gh-37-autoscaler
Browse files Browse the repository at this point in the history
add metrics to measure autoscaler performance and health
  • Loading branch information
jrasell authored Nov 5, 2019
2 parents 05ac7f0 + 5789fd3 commit 5a9154c
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 0 deletions.
55 changes: 55 additions & 0 deletions docs/configuration/telemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,3 +259,58 @@ Scaling state backend metrics allow operators to get insight into how the scalin
<td>Summary</td>
</tr>
</table>

# Autoscale Metrics

Autoscale metrics allow operators to get insight into how the autoscaler is functioning.

<table class="table table-bordered table-striped">
<tr>
<th>Metric</th>
<th>Description</th>
<th>Unit</th>
<th>Type</th>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.evaluation`</td>
<td>The time taken to perform the autoscaling evaluation for the job named {job}</td>
<td>Milliseconds</td>
<td>Summary</td>
</tr>
<tr>
<td>`sherpa.autoscale.evaluation.error`</td>
<td>Number of autoscaling evaluation errors across all jobs</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.evaluation.error`</td>
<td>Number of autoscaling evaluation errors for the job named {job}</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.trigger.error`</td>
<td>Number of autoscaling scale trigger errors across all jobs</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.trigger.error`</td>
<td>Number of autoscaling scale trigger errors for the job named {job}</td>
<td>Number of errors</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.trigger.success`</td>
<td>Number of autoscaling scale trigger successes across all jobs</td>
<td>Number of successes</td>
<td>Counter</td>
</tr>
<tr>
<td>`sherpa.autoscale.{job}.trigger.success`</td>
<td>Number of autoscaling scale trigger successes for the job named {job}</td>
<td>Number of successes</td>
<td>Counter</td>
</tr>
</table>
7 changes: 7 additions & 0 deletions pkg/autoscale/autoscale.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package autoscale

import (
"strconv"
"time"

"github.com/armon/go-metrics"
nomad "github.com/hashicorp/nomad/api"
"github.com/jrasell/sherpa/pkg/helper"
"github.com/jrasell/sherpa/pkg/policy"
Expand All @@ -11,13 +13,15 @@ import (
)

func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.GroupScalingPolicy, t int64) {
defer metrics.MeasureSince([]string{"autoscale", jobID, "evaluation"}, time.Now())

// Create a new logger with the job in the context.
jobLogger := helper.LoggerWithJobContext(a.logger, jobID)

resourceInfo, allocs, err := a.getJobAllocations(jobID, policies)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to gather allocation details for job")
sendEvaluationErrorMetrics(jobID)
return
}

Expand All @@ -32,6 +36,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resourceUsage, err := a.getJobResourceUsage(allocs)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to gather job resource usage statistics")
sendEvaluationErrorMetrics(jobID)
return
}

Expand Down Expand Up @@ -104,13 +109,15 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resp, _, err := a.scaler.Trigger(jobID, scaleReq, state.SourceInternalAutoscaler)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to trigger scaling of job")
sendTriggerErrorMetrics(jobID)
}

if resp != nil {
jobLogger.Info().
Str("id", resp.ID.String()).
Str("evaluation-id", resp.EvaluationID).
Msg("successfully triggered autoscaling of job")
sendTriggerSuccessMetrics(jobID)
}
}
}
Expand Down
24 changes: 24 additions & 0 deletions pkg/autoscale/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package autoscale

import "github.com/armon/go-metrics"

// sendEvaluationErrorMetrics records a single autoscaling evaluation error for
// the given job. Two counters are incremented by one: an aggregate counter
// shared by every job, and a counter whose key includes the job name.
func sendEvaluationErrorMetrics(job string) {
	// Aggregate key: counts evaluation errors regardless of which job failed.
	overallKey := []string{"autoscale", "evaluation", "error"}

	// Job-scoped key: the job name is inserted as the second key segment.
	jobKey := []string{"autoscale", job, "evaluation", "error"}

	metrics.IncrCounter(overallKey, 1)
	metrics.IncrCounter(jobKey, 1)
}

// sendTriggerErrorMetrics records a single autoscaling scale trigger error for
// the given job. Two counters are incremented by one: an aggregate counter
// shared by every job, and a counter whose key includes the job name.
func sendTriggerErrorMetrics(job string) {
	// Aggregate key: counts trigger errors regardless of which job failed.
	overallKey := []string{"autoscale", "trigger", "error"}

	// Job-scoped key: the job name is inserted as the second key segment.
	jobKey := []string{"autoscale", job, "trigger", "error"}

	metrics.IncrCounter(overallKey, 1)
	metrics.IncrCounter(jobKey, 1)
}

// sendTriggerSuccessMetrics records a single successful autoscaling scale
// trigger for the given job. Two counters are incremented by one: an aggregate
// counter shared by every job, and a counter whose key includes the job name.
func sendTriggerSuccessMetrics(job string) {
	// Aggregate key: counts trigger successes regardless of the job involved.
	overallKey := []string{"autoscale", "trigger", "success"}

	// Job-scoped key: the job name is inserted as the second key segment.
	jobKey := []string{"autoscale", job, "trigger", "success"}

	metrics.IncrCounter(overallKey, 1)
	metrics.IncrCounter(jobKey, 1)
}

0 comments on commit 5a9154c

Please sign in to comment.