diff --git a/doc/config_metrics.md b/doc/config_metrics.md
index 17161616..7e5318e3 100644
--- a/doc/config_metrics.md
+++ b/doc/config_metrics.md
@@ -48,11 +48,18 @@ This is one of the features in GARM that I really love having. For one thing, it
## Runner metrics
-| Metric name | Type | Labels | Description |
-|----------------------|-------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------|
-| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns |
-
-More metrics will be added in the future.
+| Metric name | Type | Labels | Description |
+|--------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------|
+| `garm_runner_status` | Gauge | `name`=<runner name>
`pool_owner`=<owner name>
`pool_type`=<repository\|organization\|enterprise>
`provider`=<provider name>
`runner_status`=<running\|stopped\|error\|pending_delete\|deleting\|pending_create\|creating\|unknown>
`status`=<idle\|pending\|terminated\|installing\|failed\|active>
| This is a gauge value that gives us details about the runners garm spawns |
+| `garm_runner_operations_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\Stop> | This is a counter that increments every time a runner operation is performed |
+| `garm_runner_errors_total` | Counter | `provider`=<provider name>
`operation`=<CreateInstance\|DeleteInstance\|GetInstance\|ListInstances\|RemoveAllInstances\|Start\Stop> | This is a counter that increments every time a runner operation errored |
+
+## Github metrics
+
+| Metric name | Type | Labels | Description |
+|--------------------------------|---------|------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------|
+| `garm_github_operations_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation is performed |
+| `garm_github_errors_total` | Counter | `operation`=<ListRunners\|CreateRegistrationToken\|...>
`scope`=<Organization\|Repository\|Enterprise> | This is a counter that increments every time a github operation errored |
## Enabling metrics
diff --git a/metrics/github.go b/metrics/github.go
new file mode 100644
index 00000000..0c050652
--- /dev/null
+++ b/metrics/github.go
@@ -0,0 +1,19 @@
+package metrics
+
+import "github.com/prometheus/client_golang/prometheus"
+
+var (
+ GithubOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsGithubSubsystem,
+ Name: "operations_total",
+ Help: "Total number of github operation attempts",
+ }, []string{"operation", "scope"})
+
+ GithubOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsGithubSubsystem,
+ Name: "errors_total",
+ Help: "Total number of failed github operation attempts",
+ }, []string{"operation", "scope"})
+)
diff --git a/metrics/instance.go b/metrics/instance.go
index a4ac66bf..7c2f2f96 100644
--- a/metrics/instance.go
+++ b/metrics/instance.go
@@ -11,4 +11,18 @@ var (
Name: "status",
Help: "Status of the instance",
}, []string{"name", "status", "runner_status", "pool_owner", "pool_type", "pool_id", "provider"})
+
+ InstanceOperationCount = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsRunnerSubsystem,
+ Name: "operations_total",
+ Help: "Total number of instance operation attempts",
+ }, []string{"operation", "provider"})
+
+ InstanceOperationFailedCount = prometheus.NewCounterVec(prometheus.CounterOpts{
+ Namespace: metricsNamespace,
+ Subsystem: metricsRunnerSubsystem,
+ Name: "errors_total",
+ Help: "Total number of failed instance operation attempts",
+ }, []string{"operation", "provider"})
)
diff --git a/metrics/metrics.go b/metrics/metrics.go
index 7dec272c..44b75031 100644
--- a/metrics/metrics.go
+++ b/metrics/metrics.go
@@ -12,12 +12,15 @@ const metricsOrganizationSubsystem = "organization"
const metricsRepositorySubsystem = "repository"
const metricsEnterpriseSubsystem = "enterprise"
const metricsWebhookSubsystem = "webhook"
+const metricsGithubSubsystem = "github"
// RegisterMetrics registers all the metrics
func RegisterMetrics() error {
-
var collectors []prometheus.Collector
collectors = append(collectors,
+
+ // metrics created during the periodically update of the metrics
+ //
// runner metrics
InstanceStatus,
// organization metrics
@@ -39,6 +42,16 @@ func RegisterMetrics() error {
PoolBootstrapTimeout,
// health metrics
GarmHealth,
+
+ // metrics used within normal garm operations
+ // e.g. count instance creations, count github api calls, ...
+ //
+ // runner instances
+ InstanceOperationCount,
+ InstanceOperationFailedCount,
+ // github
+ GithubOperationCount,
+ GithubOperationFailedCount,
// webhook metrics
WebhooksReceived,
)
diff --git a/runner/pool/enterprise.go b/runner/pool/enterprise.go
index f03947b5..e71f04d4 100644
--- a/runner/pool/enterprise.go
+++ b/runner/pool/enterprise.go
@@ -13,6 +13,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
+ "github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
@@ -85,8 +86,16 @@ func (r *enterprise) findRunnerGroupByName(ctx context.Context, name string) (*g
}
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListOrganizationRunnerGroups", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
runnerGroups, ghResp, err := r.ghcEnterpriseCli.ListRunnerGroups(r.ctx, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListOrganizationRunnerGroups", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@@ -123,8 +132,16 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "GenerateEnterpriseJITConfig", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
jitConfig, resp, err := r.ghcEnterpriseCli.GenerateEnterpriseJITConfig(ctx, r.cfg.Name, &req)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "GenerateEnterpriseJITConfig", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
@@ -134,7 +151,17 @@ func (r *enterprise) GetJITConfig(ctx context.Context, instance string, pool par
runner = jitConfig.Runner
defer func() {
if err != nil && runner != nil {
+ metrics.GithubOperationCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
_, innerErr := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runner.GetID())
+ if innerErr != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
+ }
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(), "organization", r.cfg.Name)
@@ -166,8 +193,16 @@ func (r *enterprise) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "GetWorkflowJobByID", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "GetWorkflowJobByID", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
@@ -212,8 +247,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) {
var allRunners []*github.Runner
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListRunners", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
runners, ghResp, err := r.ghcEnterpriseCli.ListRunners(r.ctx, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListRunners", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@@ -231,8 +274,16 @@ func (r *enterprise) GetGithubRunners() ([]*github.Runner, error) {
func (r *enterprise) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListRunnerApplicationDownloads", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
tools, ghResp, err := r.ghcEnterpriseCli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Name)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListRunnerApplicationDownloads", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@@ -255,7 +306,19 @@ func (r *enterprise) FetchDbInstances() ([]params.Instance, error) {
}
func (r *enterprise) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
- return r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runnerID)
+ metrics.GithubOperationCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
+ ghResp, err := r.ghcEnterpriseCli.RemoveRunner(r.ctx, r.cfg.Name, runnerID)
+ if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
+ return nil, err
+ }
+ return ghResp, nil
}
func (r *enterprise) ListPools() ([]params.Pool, error) {
@@ -275,9 +338,18 @@ func (r *enterprise) JwtToken() string {
}
func (r *enterprise) GetGithubRegistrationToken() (string, error) {
+ metrics.GithubOperationCount.WithLabelValues(
+ "CreateRegistrationToken", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
+
tk, ghResp, err := r.ghcEnterpriseCli.CreateRegistrationToken(r.ctx, r.cfg.Name)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "CreateRegistrationToken", // label: operation
+ metricsLabelEnterpriseScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching registration token")
}
diff --git a/runner/pool/organization.go b/runner/pool/organization.go
index 0f534508..0eee23f4 100644
--- a/runner/pool/organization.go
+++ b/runner/pool/organization.go
@@ -27,6 +27,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
+ "github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
@@ -97,8 +98,16 @@ func (r *organization) findRunnerGroupByName(ctx context.Context, name string) (
}
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListOrganizationRunnerGroups", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
runnerGroups, ghResp, err := r.ghcli.ListOrganizationRunnerGroups(r.ctx, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListOrganizationRunnerGroups", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@@ -135,8 +144,16 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "GenerateOrgJITConfig", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
jitConfig, resp, err := r.ghcli.GenerateOrgJITConfig(ctx, r.cfg.Name, &req)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "GenerateOrgJITConfig", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
@@ -146,7 +163,17 @@ func (r *organization) GetJITConfig(ctx context.Context, instance string, pool p
runner = jitConfig.GetRunner()
defer func() {
if err != nil && runner != nil {
+ metrics.GithubOperationCount.WithLabelValues(
+ "RemoveOrganizationRunner", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
_, innerErr := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runner.GetID())
+ if innerErr != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "RemoveOrganizationRunner", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
+ }
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(), "organization", r.cfg.Name)
@@ -178,8 +205,16 @@ func (r *organization) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "GetWorkflowJobByID", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Organization.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "GetWorkflowJobByID", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
@@ -223,8 +258,16 @@ func (r *organization) GetGithubRunners() ([]*github.Runner, error) {
var allRunners []*github.Runner
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListOrganizationRunners", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
runners, ghResp, err := r.ghcli.ListOrganizationRunners(r.ctx, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListOrganizationRunners", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@@ -243,8 +286,16 @@ func (r *organization) GetGithubRunners() ([]*github.Runner, error) {
func (r *organization) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListOrganizationRunnerApplicationDownloads", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
tools, ghResp, err := r.ghcli.ListOrganizationRunnerApplicationDownloads(r.ctx, r.cfg.Name)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListOrganizationRunnerApplicationDownloads", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching tools")
}
@@ -267,7 +318,21 @@ func (r *organization) FetchDbInstances() ([]params.Instance, error) {
}
func (r *organization) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
- return r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID)
+ metrics.GithubOperationCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
+
+ ghResp, err := r.ghcli.RemoveOrganizationRunner(r.ctx, r.cfg.Name, runnerID)
+ if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
+ return nil, err
+ }
+
+ return ghResp, nil
}
func (r *organization) ListPools() ([]params.Pool, error) {
@@ -287,9 +352,17 @@ func (r *organization) JwtToken() string {
}
func (r *organization) GetGithubRegistrationToken() (string, error) {
+ metrics.GithubOperationCount.WithLabelValues(
+ "CreateOrganizationRegistrationToken", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
tk, ghResp, err := r.ghcli.CreateOrganizationRegistrationToken(r.ctx, r.cfg.Name)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "CreateOrganizationRegistrationToken", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching token")
}
@@ -340,8 +413,16 @@ func (r *organization) listHooks(ctx context.Context) ([]*github.Hook, error) {
}
var allHooks []*github.Hook
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListOrgHooks", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
hooks, ghResp, err := r.ghcli.ListOrgHooks(ctx, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListOrgHooks", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusNotFound {
return nil, runnerErrors.NewBadRequestError("organization not found or your PAT does not have access to manage webhooks")
}
@@ -366,12 +447,30 @@ func (r *organization) InstallHook(ctx context.Context, req *github.Hook) (param
return params.HookInfo{}, errors.Wrap(err, "validating hook request")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "CreateOrgHook", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
+
hook, _, err := r.ghcli.CreateOrgHook(ctx, r.cfg.Name, req)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "CreateOrgHook", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
return params.HookInfo{}, errors.Wrap(err, "creating organization hook")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "PingOrgHook", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
+
if _, err := r.ghcli.PingOrgHook(ctx, r.cfg.Name, hook.GetID()); err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "PingOrgHook", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
slog.With(slog.Any("error", err)).ErrorContext(ctx, "failed to ping hook", "hook_id", hook.GetID())
}
@@ -386,8 +485,16 @@ func (r *organization) UninstallHook(ctx context.Context, url string) error {
for _, hook := range allHooks {
if hook.Config["url"] == url {
+ metrics.GithubOperationCount.WithLabelValues(
+ "DeleteOrgHook", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
_, err = r.ghcli.DeleteOrgHook(ctx, r.cfg.Name, hook.GetID())
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "DeleteOrgHook", // label: operation
+ metricsLabelOrganizationScope, // label: scope
+ ).Inc()
return errors.Wrap(err, "deleting hook")
}
return nil
diff --git a/runner/pool/pool.go b/runner/pool/pool.go
index b15e3677..8e69ff03 100644
--- a/runner/pool/pool.go
+++ b/runner/pool/pool.go
@@ -57,6 +57,10 @@ const (
// before we give up.
// TODO: make this configurable(?)
maxCreateAttempts = 5
+
+ metricsLabelEnterpriseScope = "Enterprise"
+ metricsLabelRepositoryScope = "Repository"
+ metricsLabelOrganizationScope = "Organization"
)
type keyMutex struct {
diff --git a/runner/pool/repository.go b/runner/pool/repository.go
index 0505eece..9ba77c13 100644
--- a/runner/pool/repository.go
+++ b/runner/pool/repository.go
@@ -27,6 +27,7 @@ import (
runnerErrors "github.com/cloudbase/garm-provider-common/errors"
commonParams "github.com/cloudbase/garm-provider-common/params"
dbCommon "github.com/cloudbase/garm/database/common"
+ "github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
"github.com/cloudbase/garm/util"
@@ -99,8 +100,16 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par
// TODO(gabriel-samfira): Should we make this configurable?
WorkFolder: github.String("_work"),
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "GenerateRepoJITConfig", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
jitConfig, resp, err := r.ghcli.GenerateRepoJITConfig(ctx, r.cfg.Owner, r.cfg.Name, &req)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "GenerateRepoJITConfig", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
if resp != nil && resp.StatusCode == http.StatusUnauthorized {
return nil, nil, fmt.Errorf("failed to get JIT config: %w", err)
}
@@ -110,7 +119,17 @@ func (r *repository) GetJITConfig(ctx context.Context, instance string, pool par
defer func() {
if err != nil && runner != nil {
+ metrics.GithubOperationCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
_, innerErr := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runner.GetID())
+ if innerErr != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
+ }
slog.With(slog.Any("error", innerErr)).ErrorContext(
ctx, "failed to remove runner",
"runner_id", runner.GetID(),
@@ -144,8 +163,16 @@ func (r *repository) GetRunnerInfoFromWorkflow(job params.WorkflowJob) (params.R
if err := r.ValidateOwner(job); err != nil {
return params.RunnerInfo{}, errors.Wrap(err, "validating owner")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "GetWorkflowJobByID", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
workflow, ghResp, err := r.ghcli.GetWorkflowJobByID(r.ctx, job.Repository.Owner.Login, job.Repository.Name, job.WorkflowJob.ID)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "GetWorkflowJobByID", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return params.RunnerInfo{}, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching workflow info")
}
@@ -189,8 +216,16 @@ func (r *repository) GetGithubRunners() ([]*github.Runner, error) {
var allRunners []*github.Runner
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListRunners", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
runners, ghResp, err := r.ghcli.ListRunners(r.ctx, r.cfg.Owner, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListRunners", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching runners")
}
@@ -209,8 +244,16 @@ func (r *repository) GetGithubRunners() ([]*github.Runner, error) {
func (r *repository) FetchTools() ([]commonParams.RunnerApplicationDownload, error) {
r.mux.Lock()
defer r.mux.Unlock()
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListRunnerApplicationDownloads", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
tools, ghResp, err := r.ghcli.ListRunnerApplicationDownloads(r.ctx, r.cfg.Owner, r.cfg.Name)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "ListRunnerApplicationDownloads", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return nil, errors.Wrap(runnerErrors.ErrUnauthorized, "fetching tools")
}
@@ -233,7 +276,20 @@ func (r *repository) FetchDbInstances() ([]params.Instance, error) {
}
func (r *repository) RemoveGithubRunner(runnerID int64) (*github.Response, error) {
- return r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID)
+ metrics.GithubOperationCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
+ ghResp, err := r.ghcli.RemoveRunner(r.ctx, r.cfg.Owner, r.cfg.Name, runnerID)
+ if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "RemoveRunner", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
+ return nil, err
+ }
+
+ return ghResp, nil
}
func (r *repository) ListPools() ([]params.Pool, error) {
@@ -253,9 +309,17 @@ func (r *repository) JwtToken() string {
}
func (r *repository) GetGithubRegistrationToken() (string, error) {
+ metrics.GithubOperationCount.WithLabelValues(
+ "CreateRegistrationToken", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
tk, ghResp, err := r.ghcli.CreateRegistrationToken(r.ctx, r.cfg.Owner, r.cfg.Name)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "CreateRegistrationToken", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusUnauthorized {
return "", errors.Wrap(runnerErrors.ErrUnauthorized, "fetching token")
}
@@ -305,8 +369,16 @@ func (r *repository) listHooks(ctx context.Context) ([]*github.Hook, error) {
}
var allHooks []*github.Hook
for {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListRepoHooks", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
hooks, ghResp, err := r.ghcli.ListRepoHooks(ctx, r.cfg.Owner, r.cfg.Name, &opts)
if err != nil {
+ metrics.GithubOperationCount.WithLabelValues(
+ "ListRepoHooks", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
if ghResp != nil && ghResp.StatusCode == http.StatusNotFound {
return nil, runnerErrors.NewBadRequestError("repository not found or your PAT does not have access to manage webhooks")
}
@@ -331,12 +403,30 @@ func (r *repository) InstallHook(ctx context.Context, req *github.Hook) (params.
return params.HookInfo{}, errors.Wrap(err, "validating hook request")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "CreateRepoHook", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
+
hook, _, err := r.ghcli.CreateRepoHook(ctx, r.cfg.Owner, r.cfg.Name, req)
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "CreateRepoHook", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
return params.HookInfo{}, errors.Wrap(err, "creating repository hook")
}
+ metrics.GithubOperationCount.WithLabelValues(
+ "PingRepoHook", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
+
if _, err := r.ghcli.PingRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID()); err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "PingRepoHook", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
slog.With(slog.Any("error", err)).ErrorContext(
ctx, "failed to ping hook",
"hook_id", hook.GetID(),
@@ -355,8 +445,16 @@ func (r *repository) UninstallHook(ctx context.Context, url string) error {
for _, hook := range allHooks {
if hook.Config["url"] == url {
+ metrics.GithubOperationCount.WithLabelValues(
+ "DeleteRepoHook", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
_, err = r.ghcli.DeleteRepoHook(ctx, r.cfg.Owner, r.cfg.Name, hook.GetID())
if err != nil {
+ metrics.GithubOperationFailedCount.WithLabelValues(
+ "DeleteRepoHook", // label: operation
+ metricsLabelRepositoryScope, // label: scope
+ ).Inc()
return errors.Wrap(err, "deleting hook")
}
return nil
diff --git a/runner/providers/external/external.go b/runner/providers/external/external.go
index d1574047..11b614de 100644
--- a/runner/providers/external/external.go
+++ b/runner/providers/external/external.go
@@ -14,6 +14,7 @@ import (
garmErrors "github.com/cloudbase/garm-provider-common/errors"
garmExec "github.com/cloudbase/garm-provider-common/util/exec"
"github.com/cloudbase/garm/config"
+ "github.com/cloudbase/garm/metrics"
"github.com/cloudbase/garm/params"
"github.com/cloudbase/garm/runner/common"
@@ -82,17 +83,34 @@ func (e *external) CreateInstance(ctx context.Context, bootstrapParams commonPar
return commonParams.ProviderInstance{}, errors.Wrap(err, "serializing bootstrap params")
}
+ metrics.InstanceOperationCount.WithLabelValues(
+ "CreateInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
+
out, err := garmExec.Exec(ctx, e.execPath, asJs, asEnv)
if err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "CreateInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
var param commonParams.ProviderInstance
if err := json.Unmarshal(out, ¶m); err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "CreateInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err)
}
if err := e.validateResult(ctx, param); err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "CreateInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err)
}
@@ -113,10 +131,18 @@ func (e *external) DeleteInstance(ctx context.Context, instance string) error {
}
asEnv = append(asEnv, e.environmentVariables...)
+ metrics.InstanceOperationCount.WithLabelValues(
+ "DeleteInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
var exitErr *exec.ExitError
if !errors.As(err, &exitErr) || exitErr.ExitCode() != execution.ExitCodeNotFound {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "DeleteInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
@@ -134,19 +160,35 @@ func (e *external) GetInstance(ctx context.Context, instance string) (commonPara
}
asEnv = append(asEnv, e.environmentVariables...)
- // TODO(gabriel-samfira): handle error types. Of particular insterest is to
+ // TODO(gabriel-samfira): handle error types. Of particular interest is to
// know when the error is ErrNotFound.
+ metrics.InstanceOperationCount.WithLabelValues(
+ "GetInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
out, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "GetInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
var param commonParams.ProviderInstance
if err := json.Unmarshal(out, ¶m); err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "GetInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err)
}
if err := e.validateResult(ctx, param); err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "GetInstance", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err)
}
@@ -163,19 +205,36 @@ func (e *external) ListInstances(ctx context.Context, poolID string) ([]commonPa
}
asEnv = append(asEnv, e.environmentVariables...)
+ metrics.InstanceOperationCount.WithLabelValues(
+ "ListInstances", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
+
out, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "ListInstances", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
var param []commonParams.ProviderInstance
if err := json.Unmarshal(out, ¶m); err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "ListInstances", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to decode response from binary: %s", err)
}
ret := make([]commonParams.ProviderInstance, len(param))
for idx, inst := range param {
if err := e.validateResult(ctx, inst); err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "ListInstances", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return []commonParams.ProviderInstance{}, garmErrors.NewProviderError("failed to validate result: %s", err)
}
ret[idx] = inst
@@ -192,8 +251,17 @@ func (e *external) RemoveAllInstances(ctx context.Context) error {
}
asEnv = append(asEnv, e.environmentVariables...)
+ metrics.InstanceOperationCount.WithLabelValues(
+ "RemoveAllInstances", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
+
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "RemoveAllInstances", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
return nil
@@ -209,8 +277,16 @@ func (e *external) Stop(ctx context.Context, instance string, force bool) error
}
asEnv = append(asEnv, e.environmentVariables...)
+ metrics.InstanceOperationCount.WithLabelValues(
+ "Stop", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "Stop", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
return nil
@@ -226,8 +302,17 @@ func (e *external) Start(ctx context.Context, instance string) error {
}
asEnv = append(asEnv, e.environmentVariables...)
+ metrics.InstanceOperationCount.WithLabelValues(
+ "Start", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
+
_, err := garmExec.Exec(ctx, e.execPath, nil, asEnv)
if err != nil {
+ metrics.InstanceOperationFailedCount.WithLabelValues(
+ "Start", // label: operation
+ e.cfg.Name, // label: provider
+ ).Inc()
return garmErrors.NewProviderError("provider binary %s returned error: %s", e.execPath, err)
}
return nil