diff --git a/docs/configuration/telemetry.md b/docs/configuration/telemetry.md
index 1003c46..a247d34 100644
--- a/docs/configuration/telemetry.md
+++ b/docs/configuration/telemetry.md
@@ -259,3 +259,58 @@ Scaling state backend metrics allow operators to get insight into how the scalin
+
+| Metric | Description | Unit | Type |
+| ------ | ----------- | ---- | ---- |
+| `sherpa.autoscale.{job}.evaluation` | The time taken to perform the autoscaling evaluation for the job named {job} | Milliseconds | Summary |
+| `sherpa.autoscale.evaluation.error` | Number of autoscaling evaluation errors across all jobs | Number of errors | Counter |
+| `sherpa.autoscale.{job}.evaluation.error` | Number of autoscaling evaluation errors for the job named {job} | Number of errors | Counter |
+| `sherpa.autoscale.trigger.error` | Number of autoscaling scale trigger errors across all jobs | Number of errors | Counter |
+| `sherpa.autoscale.{job}.trigger.error` | Number of autoscaling scale trigger errors for the job named {job} | Number of errors | Counter |
+| `sherpa.autoscale.trigger.success` | Number of autoscaling scale trigger successes across all jobs | Number of successes | Counter |
+| `sherpa.autoscale.{job}.trigger.success` | Number of autoscaling scale trigger successes for the job named {job} | Number of successes | Counter |
+
diff --git a/pkg/autoscale/autoscale.go b/pkg/autoscale/autoscale.go
index 7a13e04..9e0fcf7 100644
--- a/pkg/autoscale/autoscale.go
+++ b/pkg/autoscale/autoscale.go
@@ -2,7 +2,9 @@ package autoscale
import (
"strconv"
+ "time"
+ "github.com/armon/go-metrics"
nomad "github.com/hashicorp/nomad/api"
"github.com/jrasell/sherpa/pkg/helper"
"github.com/jrasell/sherpa/pkg/policy"
@@ -11,6 +13,7 @@ import (
)
func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.GroupScalingPolicy, t int64) {
+ defer metrics.MeasureSince([]string{"autoscale", jobID, "evaluation"}, time.Now())
// Create a new logger with the job in the context.
jobLogger := helper.LoggerWithJobContext(a.logger, jobID)
@@ -18,6 +21,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resourceInfo, allocs, err := a.getJobAllocations(jobID, policies)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to gather allocation details for job")
+ sendEvaluationErrorMetrics(jobID)
return
}
@@ -32,6 +36,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resourceUsage, err := a.getJobResourceUsage(allocs)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to gather job resource usage statistics")
+ sendEvaluationErrorMetrics(jobID)
return
}
@@ -104,6 +109,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
resp, _, err := a.scaler.Trigger(jobID, scaleReq, state.SourceInternalAutoscaler)
if err != nil {
jobLogger.Error().Err(err).Msg("failed to trigger scaling of job")
+ sendTriggerErrorMetrics(jobID)
}
if resp != nil {
@@ -111,6 +117,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
Str("id", resp.ID.String()).
Str("evaluation-id", resp.EvaluationID).
Msg("successfully triggered autoscaling of job")
+ sendTriggerSuccessMetrics(jobID)
}
}
}
diff --git a/pkg/autoscale/metrics.go b/pkg/autoscale/metrics.go
new file mode 100644
index 0000000..25708ed
--- /dev/null
+++ b/pkg/autoscale/metrics.go
@@ -0,0 +1,24 @@
+package autoscale
+
+import "github.com/armon/go-metrics"
+
+// sendEvaluationErrorMetrics records one autoscaling evaluation error for job by incrementing the
+// all-jobs counter (autoscale, evaluation, error) and the job's (autoscale, job, evaluation, error).
+func sendEvaluationErrorMetrics(job string) {
+ metrics.IncrCounter([]string{"autoscale", "evaluation", "error"}, 1)
+ metrics.IncrCounter([]string{"autoscale", job, "evaluation", "error"}, 1)
+}
+
+// sendTriggerErrorMetrics records one failed autoscaling scale trigger for job by incrementing the
+// all-jobs counter (autoscale, trigger, error) and the job's (autoscale, job, trigger, error).
+func sendTriggerErrorMetrics(job string) {
+ metrics.IncrCounter([]string{"autoscale", "trigger", "error"}, 1)
+ metrics.IncrCounter([]string{"autoscale", job, "trigger", "error"}, 1)
+}
+
+// sendTriggerSuccessMetrics records one successful autoscaling scale trigger for job by incrementing
+// the all-jobs counter (autoscale, trigger, success) and the job's (autoscale, job, trigger, success).
+func sendTriggerSuccessMetrics(job string) {
+ metrics.IncrCounter([]string{"autoscale", "trigger", "success"}, 1)
+ metrics.IncrCounter([]string{"autoscale", job, "trigger", "success"}, 1)
+}