Skip to content

Commit

Permalink
Merge pull request #216 from mercedes-benz/extended_metrics
Browse files Browse the repository at this point in the history
chore: refactor metrics endpoint
  • Loading branch information
gabriel-samfira authored Feb 20, 2024
2 parents f68cf98 + b1cbfac commit e108140
Show file tree
Hide file tree
Showing 28 changed files with 651 additions and 612 deletions.
45 changes: 17 additions & 28 deletions apiserver/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ import (
)

func NewAPIController(r *runner.Runner, authenticator *auth.Authenticator, hub *wsWriter.Hub) (*APIController, error) {
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext())
controllerInfo, err := r.GetControllerInfo(auth.GetAdminContext(context.Background()))
if err != nil {
return nil, errors.Wrap(err, "failed to get controller info")
}
Expand Down Expand Up @@ -95,19 +95,6 @@ func handleError(ctx context.Context, w http.ResponseWriter, err error) {
}
}

func (a *APIController) webhookMetricLabelValues(ctx context.Context, valid, reason string) []string {
controllerInfo, err := a.r.GetControllerInfo(auth.GetAdminContext())
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to get controller info")
// If labels are empty, not attempt will be made to record webhook.
return []string{}
}
return []string{
valid, reason,
controllerInfo.Hostname, controllerInfo.ControllerID.String(),
}
}

func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.ResponseWriter, r *http.Request) {
defer r.Body.Close()
body, err := io.ReadAll(r.Body)
Expand All @@ -119,31 +106,33 @@ func (a *APIController) handleWorkflowJobEvent(ctx context.Context, w http.Respo
signature := r.Header.Get("X-Hub-Signature-256")
hookType := r.Header.Get("X-Github-Hook-Installation-Target-Type")

var labelValues []string
defer func() {
if len(labelValues) == 0 {
return
}
if err := metrics.RecordWebhookWithLabels(labelValues...); err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to record metric")
}
}()

if err := a.r.DispatchWorkflowJob(hookType, signature, body); err != nil {
if errors.Is(err, gErrors.ErrNotFound) {
labelValues = a.webhookMetricLabelValues(ctx, "false", "owner_unknown")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"owner_unknown", // label: reason
).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "got not found error from DispatchWorkflowJob. webhook not meant for us?")
return
} else if strings.Contains(err.Error(), "signature") { // TODO: check error type
labelValues = a.webhookMetricLabelValues(ctx, "false", "signature_invalid")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"signature_invalid", // label: reason
).Inc()
} else {
labelValues = a.webhookMetricLabelValues(ctx, "false", "unknown")
metrics.WebhooksReceived.WithLabelValues(
"false", // label: valid
"unknown", // label: reason
).Inc()
}

handleError(ctx, w, err)
return
}
labelValues = a.webhookMetricLabelValues(ctx, "true", "")
metrics.WebhooksReceived.WithLabelValues(
"true", // label: valid
"", // label: reason
).Inc()
}

func (a *APIController) WebhookHandler(w http.ResponseWriter, r *http.Request) {
Expand Down
6 changes: 4 additions & 2 deletions auth/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,8 +238,10 @@ func UserID(ctx context.Context) string {

// GetAdminContext will return an admin context. This can be used internally
// when fetching users.
func GetAdminContext() context.Context {
ctx := context.Background()
func GetAdminContext(ctx context.Context) context.Context {
if ctx == nil {
ctx = context.Background()
}
ctx = SetUserID(ctx, "")
ctx = SetAdmin(ctx, true)
ctx = SetIsEnabled(ctx, true)
Expand Down
14 changes: 10 additions & 4 deletions cmd/garm/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
"github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/runner"
runnerMetrics "github.com/cloudbase/garm/runner/metrics"
garmUtil "github.com/cloudbase/garm/util"
"github.com/cloudbase/garm/util/appdefaults"
"github.com/cloudbase/garm/websocket"
Expand Down Expand Up @@ -214,13 +215,18 @@ func main() {

router := routers.NewAPIRouter(controller, jwtMiddleware, initMiddleware, instanceMiddleware, cfg.Default.EnableWebhookManagement)

// start the metrics collector
if cfg.Metrics.Enable {
slog.InfoContext(ctx, "registering prometheus metrics collectors")
if err := metrics.RegisterCollectors(runner); err != nil {
log.Fatal(err)
}
slog.InfoContext(ctx, "setting up metric routes")
router = routers.WithMetricsRouter(router, cfg.Metrics.DisableAuth, metricsMiddleware)

slog.InfoContext(ctx, "register metrics")
if err := metrics.RegisterMetrics(); err != nil {
log.Fatal(err)
}

slog.InfoContext(ctx, "start metrics collection")
runnerMetrics.CollectObjectMetric(ctx, runner, cfg.Metrics.Duration())
}

if cfg.Default.DebugServer {
Expand Down
33 changes: 32 additions & 1 deletion config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,8 +456,39 @@ func (t *TLSConfig) Validate() error {
}

type Metrics struct {
// DisableAuth defines if the API endpoint will be protected by
// JWT authentication
DisableAuth bool `toml:"disable_auth" json:"disable-auth"`
Enable bool `toml:"enable" json:"enable"`
// Enable define if the API endpoint for metrics collection will
// be enabled
Enable bool `toml:"enable" json:"enable"`
// Period defines the internal period at which internal metrics are getting updated
// and propagated to the /metrics endpoint
Period time.Duration `toml:"period" json:"period"`
}

// ParseDuration parses the configured duration and returns a time.Duration of 0
// if the duration is invalid.
func (m *Metrics) ParseDuration() (time.Duration, error) {
duration, err := time.ParseDuration(fmt.Sprint(m.Period))
if err != nil {
return 0, err
}
return duration, nil
}

// Duration returns the configured duration or the default duration if no value
// is configured or the configured value is invalid.
func (m *Metrics) Duration() time.Duration {
duration, err := m.ParseDuration()
if err != nil {
slog.With(slog.Any("error", err)).Error(fmt.Sprintf("defined duration %s is invalid", m.Period))
}
if duration == 0 {
slog.Debug(fmt.Sprintf("using default duration %s for metrics update interval", appdefaults.DefaultMetricsUpdateInterval))
return appdefaults.DefaultMetricsUpdateInterval
}
return duration
}

// APIServer holds configuration for the API server
Expand Down
34 changes: 23 additions & 11 deletions doc/config_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ This is one of the features in GARM that I really love having. For one thing, it

## Common metrics

| Metric name | Type | Labels | Description |
|--------------------------|---------|-------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `controller_id`=&lt;controller id&gt; <br>`name`=&lt;hostname&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |
| Metric name | Type | Labels | Description |
|--------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------|
| `garm_health` | Gauge | `controller_id`=&lt;controller id&gt; <br>`callback_url`=&lt;callback url&gt; <br>`controller_webhook_url`=&lt;controller webhook url&gt; <br>`metadata_url`=&lt;metadata url&gt; <br>`webhook_url`=&lt;webhook url&gt; <br>`name`=&lt;hostname&gt; | This is a gauge that is set to 1 if GARM is healthy and 0 if it is not. This is useful for alerting. |
| `garm_webhooks_received` | Counter | `valid`=&lt;valid request&gt; <br>`reason`=&lt;reason for invalid requests&gt; | This is a counter that increments every time GARM receives a webhook from GitHub. |

## Enterprise metrics

Expand Down Expand Up @@ -48,9 +48,9 @@ This is one of the features in GARM that I really love having. For one thing, it

## Runner metrics

| Metric name | Type | Labels | Description |
|----------------------|-------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `controller_id`=&lt;controller id&gt; <br>`hostname`=&lt;hostname&gt; <br>`name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| Metric name | Type | Labels | Description |
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |

More metrics will be added in the future.

Expand All @@ -60,15 +60,27 @@ Metrics are disabled by default. To enable them, add the following to your confi

```toml
[metrics]
# Toggle metrics. If set to false, the API endpoint for metrics collection will
# be disabled.
enable = true

# Toggle to disable authentication (not recommended) on the metrics endpoint.
# If you do disable authentication, I encourage you to put a reverse proxy in front
# of garm and limit which systems can access that particular endpoint. Ideally, you
# would enable some kind of authentication using the reverse proxy, if the built-in auth
# is not sufficient for your needs.
disable_auth = false
#
# Default: false
disable_auth = true

# Toggle metrics. If set to false, the API endpoint for metrics collection will
# be disabled.
#
# Default: false
enable = true

# period is the time interval when the /metrics endpoint will update internal metrics about
# controller specific objects (e.g. runners, pools, etc.)
#
# Default: "60s"
period = "30s"
```

You can choose to disable authentication if you wish, however it's not terribly difficult to set up, so I generally advise against disabling it.
Expand Down
57 changes: 14 additions & 43 deletions metrics/enterprise.go
Original file line number Diff line number Diff line change
@@ -1,50 +1,21 @@
package metrics

import (
"log/slog"
"strconv"

"github.com/cloudbase/garm/auth"
"github.com/prometheus/client_golang/prometheus"
)

// CollectOrganizationMetric collects the metrics for the enterprise objects
func (c *GarmCollector) CollectEnterpriseMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
ctx := auth.GetAdminContext()

enterprises, err := c.runner.ListEnterprises(ctx)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "listing providers")
return
}

for _, enterprise := range enterprises {
var (
EnterpriseInfo = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "info",
Help: "Info of the enterprise",
}, []string{"name", "id"})

enterpriseInfo, err := prometheus.NewConstMetric(
c.enterpriseInfo,
prometheus.GaugeValue,
1,
enterprise.Name, // label: name
enterprise.ID, // label: id
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterpriseInfo metric")
continue
}
ch <- enterpriseInfo

enterprisePoolManagerStatus, err := prometheus.NewConstMetric(
c.enterprisePoolManagerStatus,
prometheus.GaugeValue,
bool2float64(enterprise.PoolManagerStatus.IsRunning),
enterprise.Name, // label: name
enterprise.ID, // label: id
strconv.FormatBool(enterprise.PoolManagerStatus.IsRunning), // label: running
)
if err != nil {
slog.With(slog.Any("error", err)).ErrorContext(ctx, "cannot collect enterprisePoolManagerStatus metric")
continue
}
ch <- enterprisePoolManagerStatus
}
}
EnterprisePoolManagerStatus = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Subsystem: metricsEnterpriseSubsystem,
Name: "pool_manager_status",
Help: "Status of the enterprise pool manager",
}, []string{"name", "id", "running"})
)
23 changes: 7 additions & 16 deletions metrics/health.go
Original file line number Diff line number Diff line change
@@ -1,22 +1,13 @@
package metrics

import (
"log/slog"

"github.com/prometheus/client_golang/prometheus"
)

func (c *GarmCollector) CollectHealthMetric(ch chan<- prometheus.Metric, hostname string, controllerID string) {
m, err := prometheus.NewConstMetric(
c.healthMetric,
prometheus.GaugeValue,
1,
hostname,
controllerID,
)
if err != nil {
slog.With(slog.Any("error", err)).Error("error on creating health metric")
return
}
ch <- m
}
var (
GarmHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "health",
Help: "Health of the garm",
}, []string{"metadata_url", "callback_url", "webhook_url", "controller_webhook_url", "controller_id"})
)
Loading

0 comments on commit e108140

Please sign in to comment.