diff --git a/cmd/backrest/backrest.go b/cmd/backrest/backrest.go
index f08a5acb7..2ed95daf3 100644
--- a/cmd/backrest/backrest.go
+++ b/cmd/backrest/backrest.go
@@ -21,6 +21,7 @@ import (
 	"github.com/garethgeorge/backrest/internal/config"
 	"github.com/garethgeorge/backrest/internal/env"
 	"github.com/garethgeorge/backrest/internal/logwriter"
+	"github.com/garethgeorge/backrest/internal/metric"
 	"github.com/garethgeorge/backrest/internal/oplog"
 	"github.com/garethgeorge/backrest/internal/oplog/bboltstore"
 	"github.com/garethgeorge/backrest/internal/orchestrator"
@@ -116,6 +117,7 @@ func main() {
 	mux.Handle(backrestHandlerPath, auth.RequireAuthentication(backrestHandler, authenticator))
 	mux.Handle("/", webui.Handler())
 	mux.Handle("/download/", http.StripPrefix("/download", api.NewDownloadHandler(oplog)))
+	mux.Handle("/metrics", metric.GetRegistry().Handler())
 
 	// Serve the HTTP gateway
 	server := &http.Server{
diff --git a/internal/hook/hook.go b/internal/hook/hook.go
index 0a8ca179f..aafabdf61 100644
--- a/internal/hook/hook.go
+++ b/internal/hook/hook.go
@@ -66,6 +66,7 @@ func newOneoffRunHookTask(title, instanceID, repoID, planID string, parentOp *v1
 	return &tasks.GenericOneoffTask{
 		OneoffTask: tasks.OneoffTask{
 			BaseTask: tasks.BaseTask{
+				TaskType:   "hook",
 				TaskName:   fmt.Sprintf("run hook %v", title),
 				TaskRepoID: repoID,
 				TaskPlanID: planID,
diff --git a/internal/metric/metric.go b/internal/metric/metric.go
new file mode 100644
index 000000000..30855f35f
--- /dev/null
+++ b/internal/metric/metric.go
@@ -0,0 +1,104 @@
+// Package metric exposes backrest's Prometheus metrics.
+package metric
+
+import (
+	"net/http"
+	"slices"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+// unassociatedLabel replaces an empty repo or plan ID so that every series
+// carries a non-empty label value.
+const unassociatedLabel = "_unassociated_"
+
+var (
+	globalRegistry = initRegistry()
+)
+
+// initRegistry creates the process-wide Registry and registers each of its
+// collectors on a dedicated Prometheus registry.
+func initRegistry() *Registry {
+	commonDims := []string{"repo_id", "plan_id"}
+
+	registry := &Registry{
+		reg: prometheus.NewRegistry(),
+		backupBytesProcessed: prometheus.NewSummaryVec(prometheus.SummaryOpts{
+			Name: "backrest_backup_bytes_processed",
+			Help: "The total number of bytes processed during a backup",
+		}, commonDims),
+		backupBytesAdded: prometheus.NewSummaryVec(prometheus.SummaryOpts{
+			Name: "backrest_backup_bytes_added",
+			Help: "The total number of bytes added during a backup",
+		}, commonDims),
+		backupFileWarnings: prometheus.NewSummaryVec(prometheus.SummaryOpts{
+			Name: "backrest_backup_file_warnings",
+			Help: "The total number of file warnings during a backup",
+		}, commonDims),
+		tasksDuration: prometheus.NewSummaryVec(prometheus.SummaryOpts{
+			Name: "backrest_tasks_duration_secs",
+			Help: "The duration of a task in seconds",
+		}, append(slices.Clone(commonDims), "task_type")),
+		tasksRun: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Name: "backrest_tasks_run_total",
+			Help: "The total number of tasks run",
+		}, append(slices.Clone(commonDims), "task_type", "status")),
+	}
+
+	registry.reg.MustRegister(registry.backupBytesProcessed)
+	registry.reg.MustRegister(registry.backupBytesAdded)
+	registry.reg.MustRegister(registry.backupFileWarnings)
+	registry.reg.MustRegister(registry.tasksDuration)
+	registry.reg.MustRegister(registry.tasksRun)
+
+	return registry
+}
+
+// GetRegistry returns the process-wide metrics registry.
+func GetRegistry() *Registry {
+	return globalRegistry
+}
+
+// Registry holds backrest's collectors and the Prometheus registry they are
+// registered on.
+type Registry struct {
+	reg                  *prometheus.Registry
+	backupBytesProcessed *prometheus.SummaryVec
+	backupBytesAdded     *prometheus.SummaryVec
+	backupFileWarnings   *prometheus.SummaryVec
+	tasksDuration        *prometheus.SummaryVec
+	tasksRun             *prometheus.CounterVec
+}
+
+// Handler returns an HTTP handler that serves the registered metrics.
+func (r *Registry) Handler() http.Handler {
+	return promhttp.HandlerFor(r.reg, promhttp.HandlerOpts{})
+}
+
+// RecordTaskRun counts one run of a task with the given status and observes
+// its duration in seconds. Empty repo or plan IDs are recorded under
+// unassociatedLabel.
+func (r *Registry) RecordTaskRun(repoID, planID, taskType string, durationSecs float64, status string) {
+	repoID, planID = labelOrUnassociated(repoID), labelOrUnassociated(planID)
+	r.tasksRun.WithLabelValues(repoID, planID, taskType, status).Inc()
+	r.tasksDuration.WithLabelValues(repoID, planID, taskType).Observe(durationSecs)
+}
+
+// RecordBackupSummary observes the bytes processed, bytes added, and file
+// warning count of one backup. Empty repo or plan IDs are recorded under
+// unassociatedLabel, matching RecordTaskRun.
+func (r *Registry) RecordBackupSummary(repoID, planID string, bytesProcessed, bytesAdded int64, fileWarnings int64) {
+	repoID, planID = labelOrUnassociated(repoID), labelOrUnassociated(planID)
+	r.backupBytesProcessed.WithLabelValues(repoID, planID).Observe(float64(bytesProcessed))
+	r.backupBytesAdded.WithLabelValues(repoID, planID).Observe(float64(bytesAdded))
+	r.backupFileWarnings.WithLabelValues(repoID, planID).Observe(float64(fileWarnings))
+}
+
+// labelOrUnassociated returns id, or unassociatedLabel when id is empty.
+func labelOrUnassociated(id string) string {
+	if id == "" {
+		return unassociatedLabel
+	}
+	return id
+}
diff --git a/internal/orchestrator/orchestrator.go b/internal/orchestrator/orchestrator.go
index e418701bf..da36e19f2 100644
--- a/internal/orchestrator/orchestrator.go
+++ b/internal/orchestrator/orchestrator.go
@@ -12,6 +12,7 @@ import (
 	v1 "github.com/garethgeorge/backrest/gen/go/v1"
 	"github.com/garethgeorge/backrest/internal/config"
 	"github.com/garethgeorge/backrest/internal/logwriter"
+	"github.com/garethgeorge/backrest/internal/metric"
 	"github.com/garethgeorge/backrest/internal/oplog"
 	"github.com/garethgeorge/backrest/internal/orchestrator/logging"
 	"github.com/garethgeorge/backrest/internal/orchestrator/repo"
@@ -426,6 +427,8 @@ func (o *Orchestrator) RunTask(ctx context.Context, st tasks.ScheduledTask) erro
 		runner.Logger(ctx).Error("task failed", zap.Error(err), zap.Duration("duration", time.Since(start)))
+		metric.GetRegistry().RecordTaskRun(st.Task.RepoID(), st.Task.PlanID(), st.Task.Type(), time.Since(start).Seconds(), "failed")
 	} else {
 		runner.Logger(ctx).Info("task finished", zap.Duration("duration", time.Since(start)))
+		metric.GetRegistry().RecordTaskRun(st.Task.RepoID(), st.Task.PlanID(), st.Task.Type(), time.Since(start).Seconds(), "success")
 	}
 
 	if op != nil {
diff --git a/internal/orchestrator/tasks/task.go b/internal/orchestrator/tasks/task.go
index 9df8d4ee7..b34886393 100644
--- a/internal/orchestrator/tasks/task.go
+++ b/internal/orchestrator/tasks/task.go
@@ -86,6 +86,7 @@ func (s ScheduledTask) Less(other ScheduledTask) bool {
 // Task is a task that can be scheduled to run at a specific time.
 type Task interface {
 	Name() string                                                       // human readable name for this task.
+	Type() string                                                       // simple string 'type' for this task.
 	Next(now time.Time, runner TaskRunner) (ScheduledTask, error)       // returns the next scheduled task.
 	Run(ctx context.Context, st ScheduledTask, runner TaskRunner) error // run the task.
 	PlanID() string                                                     // the ID of the plan this task is associated with.
@@ -93,11 +94,16 @@ type Task interface {
 }
 
 type BaseTask struct {
+	TaskType   string
 	TaskName   string
 	TaskPlanID string
 	TaskRepoID string
 }
 
+func (b BaseTask) Type() string {
+	return b.TaskType
+}
+
 func (b BaseTask) Name() string {
 	return b.TaskName
 }
@@ -164,7 +170,7 @@ type testTaskRunner struct {
 
 var _ TaskRunner = &testTaskRunner{}
 
-func newTestTaskRunner(t testing.TB, config *v1.Config, oplog *oplog.OpLog) *testTaskRunner {
+func newTestTaskRunner(_ testing.TB, config *v1.Config, oplog *oplog.OpLog) *testTaskRunner {
 	return &testTaskRunner{
 		config: config,
 		oplog:  oplog,
diff --git a/internal/orchestrator/tasks/taskbackup.go b/internal/orchestrator/tasks/taskbackup.go
index 9ba4971da..bc7dc51d9 100644
--- a/internal/orchestrator/tasks/taskbackup.go
+++ b/internal/orchestrator/tasks/taskbackup.go
@@ -9,6 +9,7 @@ import (
 	"time"
 
 	v1 "github.com/garethgeorge/backrest/gen/go/v1"
+	"github.com/garethgeorge/backrest/internal/metric"
 	"github.com/garethgeorge/backrest/internal/oplog"
 	"github.com/garethgeorge/backrest/internal/protoutil"
 	"github.com/garethgeorge/backrest/pkg/restic"
@@ -29,6 +30,7 @@ var _ Task = &BackupTask{}
 func NewScheduledBackupTask(plan *v1.Plan) *BackupTask {
 	return &BackupTask{
 		BaseTask: BaseTask{
+			TaskType:   "backup",
 			TaskName:   fmt.Sprintf("backup for plan %q", plan.Id),
 			TaskRepoID: plan.Repo,
 			TaskPlanID: plan.Id,
@@ -39,6 +41,7 @@ func NewScheduledBackupTask(plan *v1.Plan) *BackupTask {
 func NewOneoffBackupTask(plan *v1.Plan, at time.Time) *BackupTask {
 	return &BackupTask{
 		BaseTask: BaseTask{
+			TaskType:   "backup",
 			TaskName:   fmt.Sprintf("backup for plan %q", plan.Id),
 			TaskRepoID: plan.Repo,
 			TaskPlanID: plan.Id,
@@ -132,6 +135,7 @@ func (t *BackupTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunne
 	var sendWg sync.WaitGroup
 	lastSent := time.Now() // debounce progress updates, these can endup being very frequent.
 	var lastFiles []string
+	fileErrorCount := 0
 	summary, err := repo.Backup(ctx, plan, func(entry *restic.BackupProgressEntry) {
 		sendWg.Wait()
 		if entry.MessageType == "status" {
@@ -145,6 +149,7 @@ func (t *BackupTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunne
 			backupOp.OperationBackup.LastStatus = protoutil.BackupProgressEntryToProto(entry)
 		} else if entry.MessageType == "error" {
 			l.Sugar().Warnf("an unknown error was encountered in processing item: %v", entry.Item)
+			fileErrorCount++
 			backupError, err := protoutil.BackupProgressEntryToBackupError(entry)
 			if err != nil {
 				l.Sugar().Errorf("failed to convert backup progress entry to backup error: %v", err)
@@ -180,6 +185,8 @@ func (t *BackupTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunne
 		summary = &restic.BackupProgressEntry{}
 	}
 
+	metric.GetRegistry().RecordBackupSummary(t.RepoID(), t.PlanID(), summary.TotalBytesProcessed, summary.DataAdded, int64(fileErrorCount))
+
 	vars := HookVars{
 		Task:          t.Name(),
 		SnapshotStats: summary,
diff --git a/internal/orchestrator/tasks/taskcheck.go b/internal/orchestrator/tasks/taskcheck.go
index 4455438ec..753f6aeeb 100644
--- a/internal/orchestrator/tasks/taskcheck.go
+++ b/internal/orchestrator/tasks/taskcheck.go
@@ -20,6 +20,7 @@ type CheckTask struct {
 func NewCheckTask(repoID, planID string, force bool) Task {
 	return &CheckTask{
 		BaseTask: BaseTask{
+			TaskType:   "check",
 			TaskName:   fmt.Sprintf("check for repo %q", repoID),
 			TaskRepoID: repoID,
 			TaskPlanID: planID,
diff --git a/internal/orchestrator/tasks/taskcollectgarbage.go b/internal/orchestrator/tasks/taskcollectgarbage.go
index 5be5b10ec..29e6573de 100644
--- a/internal/orchestrator/tasks/taskcollectgarbage.go
+++ b/internal/orchestrator/tasks/taskcollectgarbage.go
@@ -35,6 +35,7 @@ type CollectGarbageTask struct {
 func NewCollectGarbageTask() *CollectGarbageTask {
 	return &CollectGarbageTask{
 		BaseTask: BaseTask{
+			TaskType: "collect_garbage",
 			TaskName: "collect garbage",
 		},
 	}
diff --git a/internal/orchestrator/tasks/taskforget.go b/internal/orchestrator/tasks/taskforget.go
index af27ba2fa..f72b79688 100644
--- a/internal/orchestrator/tasks/taskforget.go
+++ b/internal/orchestrator/tasks/taskforget.go
@@ -16,6 +16,7 @@ func NewOneoffForgetTask(repoID, planID string, flowID int64, at time.Time) Task
 	return &GenericOneoffTask{
 		OneoffTask: OneoffTask{
 			BaseTask: BaseTask{
-				TaskName:   fmt.Sprintf("forget for plan %q in repo %q", repoID, planID),
+				TaskType:   "forget",
+				TaskName:   fmt.Sprintf("forget for plan %q in repo %q", planID, repoID),
 				TaskRepoID: repoID,
 				TaskPlanID: planID,
diff --git a/internal/orchestrator/tasks/taskforgetsnapshot.go b/internal/orchestrator/tasks/taskforgetsnapshot.go
index 46c4351d9..ef3174f4b 100644
--- a/internal/orchestrator/tasks/taskforgetsnapshot.go
+++ b/internal/orchestrator/tasks/taskforgetsnapshot.go
@@ -12,6 +12,7 @@ func NewOneoffForgetSnapshotTask(repoID, planID string, flowID int64, at time.Ti
 	return &GenericOneoffTask{
 		OneoffTask: OneoffTask{
 			BaseTask: BaseTask{
+				TaskType:   "forget_snapshot",
 				TaskName:   fmt.Sprintf("forget snapshot %q for plan %q in repo %q", snapshotID, planID, repoID),
 				TaskRepoID: repoID,
 				TaskPlanID: planID,
diff --git a/internal/orchestrator/tasks/taskindexsnapshots.go b/internal/orchestrator/tasks/taskindexsnapshots.go
index 27f909e0e..a9f9eabc2 100644
--- a/internal/orchestrator/tasks/taskindexsnapshots.go
+++ b/internal/orchestrator/tasks/taskindexsnapshots.go
@@ -19,6 +19,7 @@ func NewOneoffIndexSnapshotsTask(repoID string, at time.Time) Task {
 	return &GenericOneoffTask{
 		OneoffTask: OneoffTask{
 			BaseTask: BaseTask{
+				TaskType:   "index_snapshots",
 				TaskName:   fmt.Sprintf("index snapshots for repo %q", repoID),
 				TaskRepoID: repoID,
 			},
diff --git a/internal/orchestrator/tasks/taskprune.go b/internal/orchestrator/tasks/taskprune.go
index 0350b67ba..63eba4cd6 100644
--- a/internal/orchestrator/tasks/taskprune.go
+++ b/internal/orchestrator/tasks/taskprune.go
@@ -21,6 +21,7 @@ type PruneTask struct {
 func NewPruneTask(repoID, planID string, force bool) Task {
 	return &PruneTask{
 		BaseTask: BaseTask{
+			TaskType:   "prune",
 			TaskName:   fmt.Sprintf("prune repo %q", repoID),
 			TaskRepoID: repoID,
 			TaskPlanID: planID,
diff --git a/internal/orchestrator/tasks/taskrestore.go b/internal/orchestrator/tasks/taskrestore.go
index 11d705100..db3f331e8 100644
--- a/internal/orchestrator/tasks/taskrestore.go
+++ b/internal/orchestrator/tasks/taskrestore.go
@@ -15,6 +15,7 @@ func NewOneoffRestoreTask(repoID, planID string, flowID int64, at time.Time, sna
 	return &GenericOneoffTask{
 		OneoffTask: OneoffTask{
 			BaseTask: BaseTask{
+				TaskType:   "restore",
 				TaskName:   fmt.Sprintf("restore snapshot %q in repo %q", snapshotID, repoID),
 				TaskRepoID: repoID,
 				TaskPlanID: planID,
diff --git a/internal/orchestrator/tasks/taskstats.go b/internal/orchestrator/tasks/taskstats.go
index 35fad6a0e..6a541300d 100644
--- a/internal/orchestrator/tasks/taskstats.go
+++ b/internal/orchestrator/tasks/taskstats.go
@@ -18,6 +18,7 @@ type StatsTask struct {
 func NewStatsTask(repoID, planID string, force bool) Task {
 	return &StatsTask{
 		BaseTask: BaseTask{
+			TaskType:   "stats",
 			TaskName:   fmt.Sprintf("stats for repo %q", repoID),
 			TaskRepoID: repoID,
 			TaskPlanID: planID,