From e4aff64a785413cc690601b46b43eb6bc57f5a8f Mon Sep 17 00:00:00 2001 From: James Rasell Date: Mon, 4 Nov 2019 16:47:27 +0100 Subject: [PATCH 1/2] autoscaler: add metrics to track errors, successes and latencies. --- pkg/autoscale/autoscale.go | 7 +++++++ pkg/autoscale/metrics.go | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 pkg/autoscale/metrics.go diff --git a/pkg/autoscale/autoscale.go b/pkg/autoscale/autoscale.go index 7a13e04..9e0fcf7 100644 --- a/pkg/autoscale/autoscale.go +++ b/pkg/autoscale/autoscale.go @@ -2,7 +2,9 @@ package autoscale import ( "strconv" + "time" + "github.com/armon/go-metrics" nomad "github.com/hashicorp/nomad/api" "github.com/jrasell/sherpa/pkg/helper" "github.com/jrasell/sherpa/pkg/policy" @@ -11,6 +13,7 @@ import ( ) func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.GroupScalingPolicy, t int64) { + defer metrics.MeasureSince([]string{"autoscale", jobID, "evaluation"}, time.Now()) // Create a new logger with the job in the context. 
jobLogger := helper.LoggerWithJobContext(a.logger, jobID) @@ -18,6 +21,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group resourceInfo, allocs, err := a.getJobAllocations(jobID, policies) if err != nil { jobLogger.Error().Err(err).Msg("failed to gather allocation details for job") + sendEvaluationErrorMetrics(jobID) return } @@ -32,6 +36,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group resourceUsage, err := a.getJobResourceUsage(allocs) if err != nil { jobLogger.Error().Err(err).Msg("failed to gather job resource usage statistics") + sendEvaluationErrorMetrics(jobID) return } @@ -104,6 +109,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group resp, _, err := a.scaler.Trigger(jobID, scaleReq, state.SourceInternalAutoscaler) if err != nil { jobLogger.Error().Err(err).Msg("failed to trigger scaling of job") + sendTriggerErrorMetrics(jobID) } if resp != nil { @@ -111,6 +117,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group Str("id", resp.ID.String()). Str("evaluation-id", resp.EvaluationID). Msg("successfully triggered autoscaling of job") + sendTriggerSuccessMetrics(jobID) } } } diff --git a/pkg/autoscale/metrics.go b/pkg/autoscale/metrics.go new file mode 100644 index 0000000..25708ed --- /dev/null +++ b/pkg/autoscale/metrics.go @@ -0,0 +1,24 @@ +package autoscale + +import "github.com/armon/go-metrics" + +// sendEvaluationErrorMetrics is a helper to track autoscaling evaluation errors. This is done by +// tracking both overall errors, and job specific counters. +func sendEvaluationErrorMetrics(job string) { + metrics.IncrCounter([]string{"autoscale", "evaluation", "error"}, 1) + metrics.IncrCounter([]string{"autoscale", job, "evaluation", "error"}, 1) +} + +// sendTriggerErrorMetrics is a helper to track autoscaling scale trigger errors. This is done by +// tracking both overall errors, and job specific counters. 
+func sendTriggerErrorMetrics(job string) { + metrics.IncrCounter([]string{"autoscale", "trigger", "error"}, 1) + metrics.IncrCounter([]string{"autoscale", job, "trigger", "error"}, 1) +} + +// sendTriggerSuccessMetrics is a helper to track autoscaling scale trigger success. This is done +// by tracking both overall success, and job specific counters. +func sendTriggerSuccessMetrics(job string) { + metrics.IncrCounter([]string{"autoscale", "trigger", "success"}, 1) + metrics.IncrCounter([]string{"autoscale", job, "trigger", "success"}, 1) +} From 4efaba7a65cfc2d9fb402d6f74500c664e4953ff Mon Sep 17 00:00:00 2001 From: James Rasell Date: Mon, 4 Nov 2019 16:48:09 +0100 Subject: [PATCH 2/2] docs: add details of available autoscaler metrics. --- docs/configuration/telemetry.md | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/docs/configuration/telemetry.md b/docs/configuration/telemetry.md index 4383b56..976b851 100644 --- a/docs/configuration/telemetry.md +++ b/docs/configuration/telemetry.md @@ -76,3 +76,58 @@ Runtime metrics allow operators to get insight into how the Sherpa server proces Gauge + +# Autoscale Metrics + +Autoscale metrics allow operators to get insight into how the autoscaler is functioning. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Metric | Description | Unit | Type |
| --- | --- | --- | --- |
| `sherpa.autoscale.{job}.evaluation` | The time taken to perform the autoscaling evaluation for the job named {job} | Milliseconds | Summary |
| `sherpa.autoscale.evaluation.error` | Number of autoscaling evaluation errors across all jobs | Number of errors | Counter |
| `sherpa.autoscale.{job}.evaluation.error` | Number of autoscaling evaluation errors for the job named {job} | Number of errors | Counter |
| `sherpa.autoscale.trigger.error` | Number of autoscaling scale trigger errors across all jobs | Number of errors | Counter |
| `sherpa.autoscale.{job}.trigger.error` | Number of autoscaling scale trigger errors for the job named {job} | Number of errors | Counter |
| `sherpa.autoscale.trigger.success` | Number of autoscaling scale trigger successes across all jobs | Number of successes | Counter |
| `sherpa.autoscale.{job}.trigger.success` | Number of autoscaling scale trigger successes for the job named {job} | Number of successes | Counter |