Skip to content

Commit

Permalink
Merge pull request #217 from mercedes-benz/add_counter_metrics
Browse files Browse the repository at this point in the history
extend metrics for github and provider executions
  • Loading branch information
gabriel-samfira authored Feb 22, 2024
2 parents e108140 + d68cc3b commit dd6f1e4
Show file tree
Hide file tree
Showing 9 changed files with 429 additions and 10 deletions.
17 changes: 12 additions & 5 deletions doc/config_metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,18 @@ This is one of the features in GARM that I really love having. For one thing, it

## Runner metrics

| Metric name | Type | Labels | Description |
|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |

More metrics will be added in the future.
| Metric name | Type | Labels | Description |
|--------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------|
| `garm_runner_status` | Gauge | `name`=&lt;runner name&gt; <br>`pool_owner`=&lt;owner name&gt; <br>`pool_type`=&lt;repository\|organization\|enterprise&gt; <br>`provider`=&lt;provider name&gt; <br>`runner_status`=&lt;running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown&gt; <br>`status`=&lt;idle\|pending\|terminated\|installing\|failed\|active&gt; <br> | This is a gauge value that gives us details about the runners garm spawns |
| `garm_runner_operations_total` | Counter | `provider`=&lt;provider name&gt; <br>`operation`=&lt;CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\|Stop&gt; | This is a counter that increments every time a runner operation is performed |
| `garm_runner_errors_total` | Counter | `provider`=&lt;provider name&gt; <br>`operation`=&lt;CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\|Stop&gt; | This is a counter that increments every time a runner operation results in an error |

## Github metrics

| Metric name | Type | Labels | Description |
|--------------------------------|---------|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------|
| `garm_github_operations_total` | Counter | `operation`=&lt;ListRunners\|CreateRegistrationToken\|...&gt; <br>`scope`=&lt;Organization\|Repository\|Enterprise&gt; | This is a counter that increments every time a github operation is performed |
| `garm_github_errors_total` | Counter | `operation`=&lt;ListRunners\|CreateRegistrationToken\|...&gt; <br>`scope`=&lt;Organization\|Repository\|Enterprise&gt; | This is a counter that increments every time a github operation results in an error |

## Enabling metrics

Expand Down
19 changes: 19 additions & 0 deletions metrics/github.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package metrics

import "github.com/prometheus/client_golang/prometheus"

// Counter vectors for calls made against the GitHub API. Both vectors are
// partitioned by the same two labels, in this order: "operation" and "scope".
var (
	// GithubOperationCount is the counter vector for github operation
	// attempts.
	GithubOperationCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsGithubSubsystem,
			Name:      "operations_total",
			Help:      "Total number of github operation attempts",
		},
		[]string{"operation", "scope"},
	)

	// GithubOperationFailedCount is the counter vector for github operation
	// attempts that failed.
	GithubOperationFailedCount = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: metricsNamespace,
			Subsystem: metricsGithubSubsystem,
			Name:      "errors_total",
			Help:      "Total number of failed github operation attempts",
		},
		[]string{"operation", "scope"},
	)
)
14 changes: 14 additions & 0 deletions metrics/instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,18 @@ var (
Name: "status",
Help: "Status of the instance",
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})

InstanceOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsRunnerSubsystem,
Name: "operations_total",
Help: "Total number of instance operation attempts",
}, []string{"operation", "provider"})

InstanceOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsRunnerSubsystem,
Name: "errors_total",
Help: "Total number of failed instance operation attempts",
}, []string{"operation", "provider"})
)
15 changes: 14 additions & 1 deletion metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@ const metricsOrganizationSubsystem = "organization"
const metricsRepositorySubsystem = "repository"
const metricsEnterpriseSubsystem = "enterprise"
const metricsWebhookSubsystem = "webhook"
const metricsGithubSubsystem = "github"

// RegisterMetrics registers all the metrics
func RegisterMetrics() error {

var collectors []prometheus.Collector
collectors = append(collectors,

// metrics created during the periodic update of the metrics
//
// runner metrics
InstanceStatus,
// organization metrics
Expand All @@ -39,6 +42,16 @@ func RegisterMetrics() error {
PoolBootstrapTimeout,
// health metrics
GarmHealth,

// metrics used within normal garm operations
// e.g. count instance creations, count github api calls, ...
//
// runner instances
InstanceOperationCount,
InstanceOperationFailedCount,
// github
GithubOperationCount,
GithubOperationFailedCount,
// webhook metrics
WebhooksReceived,
)
Expand Down
74 changes: 73 additions & 1 deletion runner/pool/enterprise.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
"github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
Expand Down Expand Up @@ -85,8 +86,16 @@ func (r *enterprise) findRunnerGroupByName(ctx context.Context, name string) (*g
}

for {
metrics.GithubOperationCount.WithLabelValues(
"ListOrganizationRunnerGroups", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
runnerGroups, ghResp, err := r.ghcEnterpriseCli.ListRunnerGroups(r.ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListOrganizationRunnerGroups", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
Expand Down Expand Up @@ -123,8 +132,16 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
metrics.GithubOperationCount.WithLabelValues(
"GenerateEnterpriseJITConfig", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
jitConfig, resp, err := r.ghcEnterpriseCli.GenerateEnterpriseJITConfig(ctx, r.cfg.Name, &req)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GenerateEnterpriseJITConfig", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
Expand All @@ -134,7 +151,17 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par
runner = jitConfig.Runner
defer func() {
if err != nil && runner != nil {
metrics.GithubOperationCount.WithLabelValues(
"RemoveRunner", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
_, innerErr := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runner.GetID())
if innerErr != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"RemoveRunner", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
}
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(), "organization", r.cfg.Name)
Expand Down Expand Up @@ -166,8 +193,16 @@ func (r *enterprise) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
metrics.GithubOperationCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"GetWorkflowJobByID", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
Expand Down Expand Up @@ -212,8 +247,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) {

var allRunners []*github.Runner
for {
metrics.GithubOperationCount.WithLabelValues(
"ListRunners", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
runners, ghResp, err := r.ghcEnterpriseCli.ListRunners(r.ctx, r.cfg.Name, &opts)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListRunners", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
Expand All @@ -231,8 +274,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) {
func (r *enterprise) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
metrics.GithubOperationCount.WithLabelValues(
"ListRunnerApplicationDownloads", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
tools, ghResp, err := r.ghcEnterpriseCli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Name)
if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"ListRunnerApplicationDownloads", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
Expand All @@ -255,7 +306,19 @@ func (r *enterprise) FetchDbInstances() ([]params.Instance, error) {
}

// RemoveGithubRunner asks the GitHub API to remove the runner identified by
// runnerID from the enterprise and records the call in the github operation
// counters: GithubOperationCount is incremented for every attempt and
// GithubOperationFailedCount additionally when the call returns an error.
//
// The *github.Response produced by the API call is returned together with the
// error, so callers can still inspect it (for example its status code) when
// the request fails. The response may be nil if the client produced none.
func (r *enterprise) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
	metrics.GithubOperationCount.WithLabelValues(
		"RemoveRunner",              // label: operation
		metricsLabelEnterpriseScope, // label: scope
	).Inc()

	ghResp, err := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runnerID)
	if err != nil {
		metrics.GithubOperationFailedCount.WithLabelValues(
			"RemoveRunner",              // label: operation
			metricsLabelEnterpriseScope, // label: scope
		).Inc()
		// Hand the response back alongside the error instead of dropping it:
		// before instrumentation was added this method returned the client's
		// (response, error) pair untouched, and callers may rely on it.
		return ghResp, err
	}
	return ghResp, nil
}

func (r *enterprise) ListPools() ([]params.Pool, error) {
Expand All @@ -275,9 +338,18 @@ func (r *enterprise) JwtToken() string {
}

func (r *enterprise) GetGithubRegistrationToken() (string, error) {
metrics.GithubOperationCount.WithLabelValues(
"CreateRegistrationToken", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()

tk, ghResp, err := r.ghcEnterpriseCli.CreateRegistrationToken(r.ctx, r.cfg.Name)

if err != nil {
metrics.GithubOperationFailedCount.WithLabelValues(
"CreateRegistrationToken", // label: operation
metricsLabelEnterpriseScope, // label: scope
).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching registration token")
}
Expand Down
Loading

0 comments on commit dd6f1e4

Please sign in to comment.