jrasell · jrasell · Nov 5, 2019 · Nov 4, 2019 · Nov 4, 2019 · Nov 5, 2019
diff --git a/docs/configuration/telemetry.md b/docs/configuration/telemetry.md
@@ -259,3 +259,58 @@ Scaling state backend metrics allow operators to get insight into how the scalin
     <td>Summary</td>
   </tr>
 </table>
+
+# Autoscale Metrics
+
+Autoscale metrics allow operators to get insight into how the autoscaler is functioning.
+
+<table class="table table-bordered table-striped">
+  <tr>
+    <th>Metric</th>
+    <th>Description</th>
+    <th>Unit</th>
+    <th>Type</th>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.{job}.evaluation`</td>
+    <td>The time taken to perform the autoscaling evaluation for the job named {job}</td>
+    <td>Milliseconds</td>
+    <td>Summary</td>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.evaluation.error`</td>
+    <td>Number of autoscaling evaluation errors across all jobs</td>
+    <td>Number of errors</td>
+    <td>Counter</td>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.{job}.evaluation.error`</td>
+    <td>Number of autoscaling evaluation errors for the job named {job}</td>
+    <td>Number of errors</td>
+    <td>Counter</td>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.trigger.error`</td>
+    <td>Number of autoscaling scale trigger errors across all jobs</td>
+    <td>Number of errors</td>
+    <td>Counter</td>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.{job}.trigger.error`</td>
+    <td>Number of autoscaling scale trigger errors for the job named {job}</td>
+    <td>Number of errors</td>
+    <td>Counter</td>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.trigger.success`</td>
+    <td>Number of autoscaling scale trigger successes across all jobs</td>
+    <td>Number of successes</td>
+    <td>Counter</td>
+  </tr>
+  <tr>
+    <td>`sherpa.autoscale.{job}.trigger.success`</td>
+    <td>Number of autoscaling scale trigger successes for the job named {job}</td>
+    <td>Number of successes</td>
+    <td>Counter</td>
+  </tr>
+</table>
diff --git a/pkg/autoscale/autoscale.go b/pkg/autoscale/autoscale.go
@@ -2,7 +2,9 @@ package autoscale
 
 import (
 	"strconv"
+	"time"
 
+	"github.com/armon/go-metrics"
 	nomad "github.com/hashicorp/nomad/api"
 	"github.com/jrasell/sherpa/pkg/helper"
 	"github.com/jrasell/sherpa/pkg/policy"
@@ -11,13 +13,15 @@ import (
 )
 
 func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.GroupScalingPolicy, t int64) {
+	defer metrics.MeasureSince([]string{"autoscale", jobID, "evaluation"}, time.Now())
 
 	// Create a new logger with the job in the context.
 	jobLogger := helper.LoggerWithJobContext(a.logger, jobID)
 
 	resourceInfo, allocs, err := a.getJobAllocations(jobID, policies)
 	if err != nil {
 		jobLogger.Error().Err(err).Msg("failed to gather allocation details for job")
+		sendEvaluationErrorMetrics(jobID)
 		return
 	}
 
@@ -32,6 +36,7 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
 	resourceUsage, err := a.getJobResourceUsage(allocs)
 	if err != nil {
 		jobLogger.Error().Err(err).Msg("failed to gather job resource usage statistics")
+		sendEvaluationErrorMetrics(jobID)
 		return
 	}
 
@@ -104,13 +109,15 @@ func (a *AutoScale) autoscaleJob(jobID string, policies map[string]*policy.Group
 		resp, _, err := a.scaler.Trigger(jobID, scaleReq, state.SourceInternalAutoscaler)
 		if err != nil {
 			jobLogger.Error().Err(err).Msg("failed to trigger scaling of job")
+			sendTriggerErrorMetrics(jobID)
 		}
 
 		if resp != nil {
 			jobLogger.Info().
 				Str("id", resp.ID.String()).
 				Str("evaluation-id", resp.EvaluationID).
 				Msg("successfully triggered autoscaling of job")
+			sendTriggerSuccessMetrics(jobID)
 		}
 	}
 }

diff --git a/pkg/autoscale/metrics.go b/pkg/autoscale/metrics.go
@@ -0,0 +1,24 @@
+package autoscale
+
+import "github.com/armon/go-metrics"
+
+// sendEvaluationErrorMetrics records one autoscaling evaluation error for the passed job. It adds 1
+// to the overall autoscale/evaluation/error counter and to the job-specific counter keyed by job.
+func sendEvaluationErrorMetrics(job string) {
+	metrics.IncrCounter([]string{"autoscale", "evaluation", "error"}, 1)
+	metrics.IncrCounter([]string{"autoscale", job, "evaluation", "error"}, 1)
+}
+
+// sendTriggerErrorMetrics records one autoscaling scale trigger error for the passed job. It adds 1
+// to the overall autoscale/trigger/error counter and to the job-specific counter keyed by job.
+func sendTriggerErrorMetrics(job string) {
+	metrics.IncrCounter([]string{"autoscale", "trigger", "error"}, 1)
+	metrics.IncrCounter([]string{"autoscale", job, "trigger", "error"}, 1)
+}
+
+// sendTriggerSuccessMetrics records one autoscaling scale trigger success for the passed job. It adds
+// 1 to the overall autoscale/trigger/success counter and to the job-specific counter keyed by job.
+func sendTriggerSuccessMetrics(job string) {
+	metrics.IncrCounter([]string{"autoscale", "trigger", "success"}, 1)
+	metrics.IncrCounter([]string{"autoscale", job, "trigger", "success"}, 1)
+}