diff --git a/charts/fleet/templates/deployment.yaml b/charts/fleet/templates/deployment.yaml index b31a1d6ddd..4b002e30d5 100644 --- a/charts/fleet/templates/deployment.yaml +++ b/charts/fleet/templates/deployment.yaml @@ -63,6 +63,9 @@ spec: {{- if not .Values.gitops.enabled }} - --disable-gitops {{- end }} + {{- if not .Values.metrics.enabled }} + - --disable-metrics + {{- end }} {{- if .Values.debug }} - --debug - --debug-level diff --git a/charts/fleet/templates/service.yaml b/charts/fleet/templates/service.yaml new file mode 100644 index 0000000000..cc17e9b285 --- /dev/null +++ b/charts/fleet/templates/service.yaml @@ -0,0 +1,17 @@ +{{- if .Values.metrics.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: monitoring-fleet-controller + labels: + app: fleet-controller +spec: + type: ClusterIP + ports: + - port: 8080 + targetPort: 8080 + protocol: TCP + name: metrics + selector: + app: fleet-controller +{{- end }} diff --git a/charts/fleet/values.yaml b/charts/fleet/values.yaml index 8ff514834b..e70157ea8b 100644 --- a/charts/fleet/values.yaml +++ b/charts/fleet/values.yaml @@ -66,6 +66,9 @@ priorityClassName: "" gitops: enabled: true +metrics: + enabled: true + debug: false debugLevel: 0 propagateDebugSettingsToAgents: true diff --git a/go.mod b/go.mod index d87b5045bc..1344df95b2 100644 --- a/go.mod +++ b/go.mod @@ -29,6 +29,7 @@ require ( github.com/onsi/gomega v1.30.0 github.com/otiai10/copy v1.14.0 github.com/pkg/errors v0.9.1 + github.com/prometheus/client_golang v1.18.0 github.com/rancher/fleet/pkg/apis v0.0.0-00010101000000-000000000000 github.com/rancher/lasso v0.0.0-20230830164424-d684fdeb6f29 github.com/rancher/wrangler/v2 v2.1.2 @@ -185,7 +186,6 @@ require ( github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect - github.com/prometheus/client_golang v1.18.0 // indirect github.com/prometheus/client_model v0.5.0 // indirect 
github.com/prometheus/common v0.45.0 // indirect github.com/prometheus/procfs v0.12.0 // indirect diff --git a/internal/cmd/controller/operator.go b/internal/cmd/controller/operator.go index 064bd05c87..9c453edb45 100644 --- a/internal/cmd/controller/operator.go +++ b/internal/cmd/controller/operator.go @@ -8,6 +8,7 @@ import ( "github.com/rancher/fleet/internal/cmd/controller/reconciler" "github.com/rancher/fleet/internal/cmd/controller/target" "github.com/rancher/fleet/internal/manifest" + "github.com/rancher/fleet/internal/metrics" "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "k8s.io/apimachinery/pkg/runtime" @@ -29,12 +30,31 @@ func init() { //+kubebuilder:scaffold:scheme } -func start(ctx context.Context, systemNamespace string, config *rest.Config, leaderOpts LeaderElectionOptions, bindAddresses BindAddresses, disableGitops bool) error { - setupLog.Info("listening for changes on local cluster", "disableGitops", disableGitops) +func start( + ctx context.Context, + systemNamespace string, + config *rest.Config, + leaderOpts LeaderElectionOptions, + bindAddresses BindAddresses, + disableGitops bool, + disableMetrics bool, +) error { + setupLog.Info("listening for changes on local cluster", + "disableGitops", disableGitops, + "disableMetrics", disableMetrics, + ) + + var metricServerOptions metricsserver.Options + if disableMetrics { + metricServerOptions = metricsserver.Options{BindAddress: "0"} + } else { + metricServerOptions = metricsserver.Options{BindAddress: bindAddresses.Metrics} + metrics.RegisterMetrics() // enable fleet related metrics + } mgr, err := ctrl.NewManager(config, ctrl.Options{ Scheme: scheme, - Metrics: metricsserver.Options{BindAddress: bindAddresses.Metrics}, + Metrics: metricServerOptions, HealthProbeBindAddress: bindAddresses.HealthProbe, LeaderElection: true, diff --git a/internal/cmd/controller/reconciler/bundle_controller.go b/internal/cmd/controller/reconciler/bundle_controller.go index 072a6f48a3..e3556f4445 100644 
--- a/internal/cmd/controller/reconciler/bundle_controller.go +++ b/internal/cmd/controller/reconciler/bundle_controller.go @@ -8,6 +8,7 @@ import ( "github.com/rancher/fleet/internal/cmd/controller/summary" "github.com/rancher/fleet/internal/cmd/controller/target" "github.com/rancher/fleet/internal/manifest" + "github.com/rancher/fleet/internal/metrics" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -96,18 +97,21 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr if err := resetStatus(&bundle.Status, matchedTargets); err != nil { updateDisplay(&bundle.Status) + metrics.CollectBundleMetrics(bundle) return ctrl.Result{}, err } // this will add the defaults for a new bundledeployment if err := target.UpdatePartitions(&bundle.Status, matchedTargets); err != nil { updateDisplay(&bundle.Status) + metrics.CollectBundleMetrics(bundle) return ctrl.Result{}, err } if bundle.Status.ObservedGeneration != bundle.Generation { if err := setResourceKey(context.Background(), &bundle.Status, bundle, manifest, r.isNamespaced); err != nil { updateDisplay(&bundle.Status) + metrics.CollectBundleMetrics(bundle) return ctrl.Result{}, err } } @@ -144,6 +148,7 @@ func (r *BundleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr } updateDisplay(&bundle.Status) + metrics.CollectBundleMetrics(bundle) err = retry.RetryOnConflict(retry.DefaultRetry, func() error { t := &fleet.Bundle{} err := r.Get(ctx, req.NamespacedName, t) diff --git a/internal/cmd/controller/reconciler/bundledeployment_controller.go b/internal/cmd/controller/reconciler/bundledeployment_controller.go index bc358f2ba8..16de1ad212 100644 --- a/internal/cmd/controller/reconciler/bundledeployment_controller.go +++ b/internal/cmd/controller/reconciler/bundledeployment_controller.go @@ -7,6 +7,7 @@ import ( "reflect" "github.com/rancher/fleet/internal/cmd/controller/summary" + 
"github.com/rancher/fleet/internal/metrics" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/rancher/wrangler/v2/pkg/genericcondition" @@ -68,7 +69,13 @@ func (r *BundleDeploymentReconciler) Reconcile(ctx context.Context, req ctrl.Req return err } t.Status = bd.Status - return r.Status().Update(ctx, t) + err = r.Status().Update(ctx, t) + if err != nil { + return err + } + + metrics.CollectBundleDeploymentMetrics(t) + return nil }) if err != nil { logger.V(1).Error(err, "Reconcile failed final update to bundle deployment status", "status", bd.Status) diff --git a/internal/cmd/controller/reconciler/cluster_controller.go b/internal/cmd/controller/reconciler/cluster_controller.go index fc1f9b9ea1..6100769961 100644 --- a/internal/cmd/controller/reconciler/cluster_controller.go +++ b/internal/cmd/controller/reconciler/cluster_controller.go @@ -9,6 +9,7 @@ import ( "time" "github.com/rancher/fleet/internal/cmd/controller/summary" + "github.com/rancher/fleet/internal/metrics" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/rancher/fleet/pkg/durations" "github.com/sirupsen/logrus" @@ -167,6 +168,8 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct logger.V(1).Error(err, "Reconcile failed final update to cluster status", "status", cluster.Status) } + metrics.CollectClusterMetrics(cluster) + if allReady && cluster.Status.ResourceCounts.Ready != cluster.Status.ResourceCounts.DesiredReady { logrus.Debugf("Cluster %s/%s is not ready because not all gitrepos are ready: %d/%d, enqueue cluster again", cluster.Namespace, cluster.Name, cluster.Status.ResourceCounts.Ready, cluster.Status.ResourceCounts.DesiredReady) diff --git a/internal/cmd/controller/reconciler/clustergroup_controller.go b/internal/cmd/controller/reconciler/clustergroup_controller.go index aec0166dc9..6e9c865a9e 100644 --- a/internal/cmd/controller/reconciler/clustergroup_controller.go +++ 
b/internal/cmd/controller/reconciler/clustergroup_controller.go @@ -8,6 +8,7 @@ import ( "reflect" "strings" + "github.com/rancher/fleet/internal/metrics" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "k8s.io/apimachinery/pkg/runtime" @@ -71,6 +72,8 @@ func (r *ClusterGroupReconciler) Reconcile(ctx context.Context, req ctrl.Request }) if err != nil { logger.V(1).Error(err, "Reconcile failed final update to cluster group status", "status", group.Status) + } else { + metrics.CollectClusterGroupMetrics(group) } return ctrl.Result{}, err diff --git a/internal/cmd/controller/reconciler/gitrepo_controller.go b/internal/cmd/controller/reconciler/gitrepo_controller.go index 6fab0a4ba2..bfbf3d1c19 100644 --- a/internal/cmd/controller/reconciler/gitrepo_controller.go +++ b/internal/cmd/controller/reconciler/gitrepo_controller.go @@ -8,6 +8,7 @@ import ( grutil "github.com/rancher/fleet/internal/cmd/controller/gitrepo" "github.com/rancher/fleet/internal/cmd/controller/imagescan" + "github.com/rancher/fleet/internal/metrics" fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" "github.com/reugn/go-quartz/quartz" @@ -68,6 +69,8 @@ func (r *GitRepoReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } + metrics.CollectGitRepoMetrics(gitrepo) + logger = logger.WithValues("commit", gitrepo.Status.Commit) logger.V(1).Info("Reconciling GitRepo", "lastAccepted", acceptedLastUpdate(gitrepo.Status.Conditions)) diff --git a/internal/cmd/controller/root.go b/internal/cmd/controller/root.go index 7219292f91..92bc80c5ab 100644 --- a/internal/cmd/controller/root.go +++ b/internal/cmd/controller/root.go @@ -31,9 +31,10 @@ import ( type FleetManager struct { command.DebugConfig - Kubeconfig string `usage:"Kubeconfig file"` - Namespace string `usage:"namespace to watch" default:"cattle-fleet-system" env:"NAMESPACE"` - DisableGitops bool `usage:"disable gitops components" name:"disable-gitops"` + Kubeconfig string 
`usage:"Kubeconfig file"` + Namespace string `usage:"namespace to watch" default:"cattle-fleet-system" env:"NAMESPACE"` + DisableGitops bool `usage:"disable gitops components" name:"disable-gitops"` + DisableMetrics bool `usage:"disable metrics" name:"disable-metrics"` } type LeaderElectionOptions struct { @@ -128,7 +129,14 @@ func (f *FleetManager) Run(cmd *cobra.Command, args []string) error { go func() { log.Println(http.ListenAndServe("localhost:6060", nil)) // nolint:gosec // Debugging only }() - if err := start(ctx, f.Namespace, kubeconfig, leaderOpts, bindAddresses, f.DisableGitops); err != nil { + if err := start( + ctx, f.Namespace, + kubeconfig, + leaderOpts, + bindAddresses, + f.DisableGitops, + f.DisableMetrics, + ); err != nil { return err } diff --git a/internal/cmd/controller/summary/summary.go b/internal/cmd/controller/summary/summary.go index 59c93a6dab..c3083fb63a 100644 --- a/internal/cmd/controller/summary/summary.go +++ b/internal/cmd/controller/summary/summary.go @@ -72,6 +72,8 @@ func IncrementResourceCounts(left *fleet.GitRepoResourceCounts, right fleet.GitR left.NotReady += right.NotReady } +// GetSummaryState returns the summary state of a bundle. The return value is +// empty if the bundle is ready. 
func GetSummaryState(summary fleet.BundleSummary) fleet.BundleState { var state fleet.BundleState for _, nonReady := range summary.NonReadyResources { diff --git a/internal/metrics/bundle_metrics.go b/internal/metrics/bundle_metrics.go new file mode 100644 index 0000000000..59b41af268 --- /dev/null +++ b/internal/metrics/bundle_metrics.go @@ -0,0 +1,157 @@ +package metrics + +import ( + "fmt" + + "github.com/rancher/fleet/internal/cmd/controller/summary" + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + bundleSubsystem = "bundle" + bundleLabels = []string{"name", "namespace", "commit", "repo", "generation", "state"} + + bundleNotReadyDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "not_ready", + Help: "Number of deployments for a specific bundle in a not ready state.", + }, + bundleLabels, + ) + bundleWaitAppliedDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "wait_applied", + Help: "Number of deployments for a specific bundle in a wait applied state.", + }, + bundleLabels, + ) + bundleErrAppliedDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "err_applied", + Help: "Number of deployments for a specific bundle in a error applied state.", + }, + bundleLabels, + ) + bundleOutOfSyncDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "out_of_sync", + Help: "Number of deployments for a specific bundle in a out of sync state.", + }, + bundleLabels, + ) + bundleModifiedDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: 
"modified", + Help: "Number of deployments for a specific bundle in a modified state.", + }, + bundleLabels, + ) + bundleReadyDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "ready", + Help: "Number of deployments for a specific bundle in a ready state.", + }, + bundleLabels, + ) + bundlePendingDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "pending", + Help: "Number of deployments for a specific bundle in a pending state.", + }, + bundleLabels, + ) + bundleDesiredReadyDeployments = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "desired_ready", + Help: "Number of deployments that are desired to be ready for a bundle.", + }, + bundleLabels, + ) + bundleObserved = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "observations_total", + Help: "The total times that this bundle has been observed", + }, + bundleLabels, + ) + bundleState = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundleSubsystem, + Name: "state", + Help: "Shows the state of this bundle based on the state label. 
A value of 1 is true 0 is false.", + }, + bundleLabels, + ) +) + +func CollectBundleMetrics(bundle *fleet.Bundle) { + if !enabled { + return + } + + labels := prometheus.Labels{ + "name": bundle.Name, + "namespace": bundle.Namespace, + "commit": bundle.ObjectMeta.Labels[commitLabel], + "repo": bundle.ObjectMeta.Labels[repoNameLabel], + "generation": fmt.Sprintf("%d", bundle.ObjectMeta.Generation), + "state": string(summary.GetSummaryState(bundle.Status.Summary)), + } + + bundleNotReadyDeployments.With(labels).Set(float64(bundle.Status.Summary.NotReady)) + bundleWaitAppliedDeployments.With(labels).Set(float64(bundle.Status.Summary.WaitApplied)) + bundleErrAppliedDeployments.With(labels).Set(float64(bundle.Status.Summary.ErrApplied)) + bundleOutOfSyncDeployments.With(labels).Set(float64(bundle.Status.Summary.OutOfSync)) + bundleModifiedDeployments.With(labels).Set(float64(bundle.Status.Summary.Modified)) + bundleReadyDeployments.With(labels).Set(float64(bundle.Status.Summary.Ready)) + bundlePendingDeployments.With(labels).Set(float64(bundle.Status.Summary.Pending)) + bundleDesiredReadyDeployments.With(labels).Set(float64(bundle.Status.Summary.DesiredReady)) + bundleObserved.With(labels).Inc() + + currentState := summary.GetSummaryState(bundle.Status.Summary) + + for _, state := range bundleStates { + labels["state"] = string(state) + + if state == currentState { + bundleState.With(labels).Set(1) + } else { + bundleState.With(labels).Set(0) + } + } +} + +func registerBundleMetrics() { + metrics.Registry.MustRegister(bundleNotReadyDeployments) + metrics.Registry.MustRegister(bundleWaitAppliedDeployments) + metrics.Registry.MustRegister(bundleErrAppliedDeployments) + metrics.Registry.MustRegister(bundleOutOfSyncDeployments) + metrics.Registry.MustRegister(bundleModifiedDeployments) + metrics.Registry.MustRegister(bundleReadyDeployments) + metrics.Registry.MustRegister(bundlePendingDeployments) + metrics.Registry.MustRegister(bundleDesiredReadyDeployments) + 
metrics.Registry.MustRegister(bundleObserved) +} diff --git a/internal/metrics/bundledeployment_metrics.go b/internal/metrics/bundledeployment_metrics.go new file mode 100644 index 0000000000..837da56220 --- /dev/null +++ b/internal/metrics/bundledeployment_metrics.go @@ -0,0 +1,86 @@ +package metrics + +import ( + "fmt" + + "github.com/rancher/fleet/internal/cmd/controller/summary" + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + bundledeploymentSubsystem = "bundledeployment" + bundledeploymentLabels = []string{ + "name", + "namespace", + "cluster_name", + "cluster_namespace", + "repo", + "commit", + "bundle", + "bundle_namespace", + "generation", + "state", + } + + bundleDeploymentState = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: bundledeploymentSubsystem, + Name: "state", + Help: "Shows the state of this bundle deployment based on the state label. 
" + + "A value of 1 is true 0 is false.", + }, + bundledeploymentLabels, + ) + + bundleDeploymentObserved = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: bundledeploymentSubsystem, + Name: "observations_total", + Help: "The total times that this bundle deployment has been observed", + }, + bundledeploymentLabels, + ) +) + +func CollectBundleDeploymentMetrics(bundleDep *fleet.BundleDeployment) { + if !enabled { + return + } + + labels := prometheus.Labels{ + "name": bundleDep.Name, + "namespace": bundleDep.Namespace, + "cluster_name": bundleDep.ObjectMeta.Labels["fleet.cattle.io/cluster"], + "cluster_namespace": bundleDep.ObjectMeta.Labels["fleet.cattle.io/cluster-namespace"], + "repo": bundleDep.ObjectMeta.Labels[repoNameLabel], + "commit": bundleDep.ObjectMeta.Labels[commitLabel], + "bundle": bundleDep.ObjectMeta.Labels["fleet.cattle.io/bundle-name"], + "bundle_namespace": bundleDep.ObjectMeta.Labels["fleet.cattle.io/bundle-namespace"], + "generation": fmt.Sprintf("%d", bundleDep.ObjectMeta.Generation), + "state": string(summary.GetDeploymentState(bundleDep)), + } + bundleDeploymentObserved.With(labels).Inc() + + currentState := summary.GetDeploymentState(bundleDep) + + for _, state := range bundleStates { + labels["state"] = string(state) + + if state == currentState { + bundleDeploymentState.With(labels).Set(1) + } else { + bundleDeploymentState.With(labels).Set(0) + } + } +} + +func registerBundleDeploymentMetrics() { + metrics.Registry.MustRegister(bundleDeploymentState) + metrics.Registry.MustRegister(bundleDeploymentObserved) +} diff --git a/internal/metrics/cluster_metrics.go b/internal/metrics/cluster_metrics.go new file mode 100644 index 0000000000..040f14bd18 --- /dev/null +++ b/internal/metrics/cluster_metrics.go @@ -0,0 +1,213 @@ +package metrics + +import ( + "fmt" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/metrics" + + 
"github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + clusterSubsystem = "cluster" + clusterLabels = []string{ + "name", + "namespace", + "cluster_name", + "cluster_display_name", + "generation", + "state", + } + + clusterNameLabel = "management.cattle.io/cluster-name" + clusterDisplayNameLabel = "management.cattle.io/cluster-display-name" + clusterStates = []string{ + string(fleet.NotReady), + string(fleet.Ready), + "WaitCheckIn", + } + + clusterAgentNodesReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "agent_nodes_ready", + Help: "The number of fleet agents in a Ready status for a given cluster.", + }, + clusterLabels, + ) + clusterAgentNodesNotReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "agent_nodes_not_ready", + Help: "The number of fleet agents not in a Ready status for a given cluster.", + }, + clusterLabels, + ) + clusterDesiredReadyGitRepos = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "desired_ready_git_repos", + Help: "The desired number of GitRepos to be in a ready state.", + }, + clusterLabels, + ) + clusterReadyGitRepos = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "ready_git_repos", + Help: "The number of GitRepos in a ready state.", + }, + clusterLabels, + ) + clusterResourcesDesiredReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_desiredready", + Help: "The number of resources for the given cluster desired to be in the Ready state.", + }, + clusterLabels, + ) + clusterResourcesMissing = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_missing", + Help: 
"The number of resources in the Missing state.", + }, + clusterLabels, + ) + clusterResourcesModified = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_modified", + Help: "The number of resources in the Modified state.", + }, + clusterLabels, + ) + clusterResourcesNotReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_notready", + Help: "The number of resources in the NotReady state.", + }, + clusterLabels, + ) + clusterResourcesOrphaned = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_orphaned", + Help: "The number of resources in the Orphaned state.", + }, + clusterLabels, + ) + clusterResourcesReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_ready", + Help: "The number of resources in the Ready state.", + }, + clusterLabels, + ) + clusterResourcesUnknown = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_unknown", + Help: "The number of resources in the Unknown state.", + }, + clusterLabels, + ) + clusterResourcesWaitApplied = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "resources_count_waitapplied", + Help: "The number of resources in the WaitApplied state.", + }, + clusterLabels, + ) + clusterState = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "state", + Help: "The current state of a given cluster", + }, + clusterLabels, + ) + clusterObserved = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: clusterSubsystem, + Name: "observations_total", + Help: "The total times that this cluster has been observed", 
+ }, + clusterLabels, + ) +) + +func CollectClusterMetrics(cluster *fleet.Cluster) { + if !enabled { + return + } + + labels := prometheus.Labels{ + "name": cluster.Name, + "namespace": cluster.Namespace, + "cluster_name": cluster.ObjectMeta.Labels[clusterNameLabel], + "cluster_display_name": cluster.ObjectMeta.Labels[clusterDisplayNameLabel], + "generation": fmt.Sprintf("%d", cluster.ObjectMeta.Generation), + "state": cluster.Status.Display.State, + } + + clusterAgentNodesReady.With(labels).Set(float64(cluster.Status.Agent.ReadyNodes)) + clusterAgentNodesNotReady.With(labels).Set(float64(cluster.Status.Agent.NonReadyNodes)) + clusterDesiredReadyGitRepos.With(labels).Set(float64(cluster.Status.DesiredReadyGitRepos)) + clusterReadyGitRepos.With(labels).Set(float64(cluster.Status.ReadyGitRepos)) + clusterResourcesDesiredReady.With(labels).Set(float64(cluster.Status.ResourceCounts.DesiredReady)) + clusterResourcesMissing.With(labels).Set(float64(cluster.Status.ResourceCounts.Missing)) + clusterResourcesModified.With(labels).Set(float64(cluster.Status.ResourceCounts.Modified)) + clusterResourcesNotReady.With(labels).Set(float64(cluster.Status.ResourceCounts.NotReady)) + clusterResourcesOrphaned.With(labels).Set(float64(cluster.Status.ResourceCounts.Orphaned)) + clusterResourcesReady.With(labels).Set(float64(cluster.Status.ResourceCounts.Ready)) + clusterResourcesUnknown.With(labels).Set(float64(cluster.Status.ResourceCounts.Unknown)) + clusterResourcesWaitApplied.With(labels).Set(float64(cluster.Status.ResourceCounts.WaitApplied)) + clusterObserved.With(labels).Inc() + + for _, state := range clusterStates { + labels["state"] = state + + if state == cluster.Status.Display.State { + clusterState.With(labels).Set(1) + } else { + clusterState.With(labels).Set(0) + } + } +} + +func registerClusterMetrics() { + metrics.Registry.MustRegister(clusterAgentNodesReady) + metrics.Registry.MustRegister(clusterAgentNodesNotReady) + 
metrics.Registry.MustRegister(clusterDesiredReadyGitRepos) + metrics.Registry.MustRegister(clusterReadyGitRepos) + metrics.Registry.MustRegister(clusterResourcesDesiredReady) + metrics.Registry.MustRegister(clusterResourcesMissing) + metrics.Registry.MustRegister(clusterResourcesModified) + metrics.Registry.MustRegister(clusterResourcesNotReady) + metrics.Registry.MustRegister(clusterResourcesOrphaned) + metrics.Registry.MustRegister(clusterResourcesReady) + metrics.Registry.MustRegister(clusterResourcesUnknown) + metrics.Registry.MustRegister(clusterResourcesWaitApplied) + metrics.Registry.MustRegister(clusterObserved) +} diff --git a/internal/metrics/clustergroup_metrics.go b/internal/metrics/clustergroup_metrics.go new file mode 100644 index 0000000000..42ae4e732a --- /dev/null +++ b/internal/metrics/clustergroup_metrics.go @@ -0,0 +1,200 @@ +package metrics + +import ( + "fmt" + + fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var ( + clusterGroupSubsystem = "cluster_group" + clusterGroupLabels = []string{"name", "namespace", "generation", "state"} + clusterGroupStates = []string{ + string(fleet.NotReady), + string(fleet.Ready), + } + + clusterGroupClusterCount = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "cluster_count", + Help: "The count of clusters in this cluster group.", + }, + clusterGroupLabels, + ) + clusterGroupNonReadyClusterCount = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "non_ready_cluster_count", + Help: "The count of non ready clusters in this cluster group.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesDesiredReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: 
clusterGroupSubsystem, + Name: "resource_count_desired_ready", + Help: "The count of resources that are desired to be in the Ready state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesMissing = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_missing", + Help: "The count of resources that are in a Missing state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesModified = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_modified", + Help: "The count of resources that are in a Modified state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesNotReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_notready", + Help: "The count of resources that are in a NotReady state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesOrphaned = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_orphaned", + Help: "The count of resources that are in a Orphaned state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesReady = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_ready", + Help: "The count of resources that are in a Ready state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesUnknown = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_unknown", + Help: "The count of resources that are in a Unknown state.", + }, + clusterGroupLabels, + ) + clusterGroupResourcesWaitApplied = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "resource_count_waitapplied", + Help: "The count of resources that are 
in a WaitApplied state.", + }, + clusterGroupLabels, + ) + clusterGroupDesiredReadyBundles = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "bundle_desired_ready", + Help: "The count of bundles that are desired to be in a Ready state.", + }, + clusterGroupLabels, + ) + clusterGroupReadyBundles = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "bundle_ready", + Help: "The count of bundles that are in a Ready state in the Cluster Group.", + }, + clusterGroupLabels, + ) + clusterGroupState = promauto.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "state", + Help: "The current state of a given cluster group.", + }, + clusterGroupLabels, + ) + clusterGroupObserved = promauto.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Subsystem: clusterGroupSubsystem, + Name: "cluster_group_observed_total", + Help: "The total times that this cluster group has been observed", + }, + clusterGroupLabels, + ) +) + +func CollectClusterGroupMetrics(clusterGroup *fleet.ClusterGroup) { + if !enabled { + return + } + + labels := prometheus.Labels{ + "name": clusterGroup.Name, + "namespace": clusterGroup.Namespace, + "generation": fmt.Sprintf("%d", clusterGroup.ObjectMeta.Generation), + "state": clusterGroup.Status.Display.State, + } + + clusterGroupClusterCount.With(labels).Set(float64(clusterGroup.Status.ClusterCount)) + clusterGroupNonReadyClusterCount.With(labels).Set(float64(clusterGroup.Status.NonReadyClusterCount)) + clusterGroupResourcesDesiredReady.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.DesiredReady)) + clusterGroupResourcesMissing.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.Missing)) + clusterGroupResourcesModified.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.Modified)) + 
clusterGroupResourcesNotReady.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.NotReady)) + clusterGroupResourcesOrphaned.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.Orphaned)) + clusterGroupResourcesReady.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.Ready)) + clusterGroupResourcesUnknown.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.Unknown)) + clusterGroupResourcesWaitApplied.With(labels).Set(float64(clusterGroup.Status.ResourceCounts.WaitApplied)) + clusterGroupDesiredReadyBundles.With(labels).Set(float64(clusterGroup.Status.Summary.DesiredReady)) + clusterGroupReadyBundles.With(labels).Set(float64(clusterGroup.Status.Summary.Ready)) + clusterGroupObserved.With(labels).Inc() + + for _, state := range clusterGroupStates { + labels["state"] = state + + if state == clusterGroup.Status.Display.State { + clusterGroupState.With(labels).Set(1) + } else { + clusterGroupState.With(labels).Set(0) + } + } +} + +func registerClusterGroupMetrics() { + metrics.Registry.MustRegister(clusterGroupClusterCount) + metrics.Registry.MustRegister(clusterGroupNonReadyClusterCount) + metrics.Registry.MustRegister(clusterGroupResourcesDesiredReady) + metrics.Registry.MustRegister(clusterGroupResourcesMissing) + metrics.Registry.MustRegister(clusterGroupResourcesModified) + metrics.Registry.MustRegister(clusterGroupResourcesNotReady) + metrics.Registry.MustRegister(clusterGroupResourcesOrphaned) + metrics.Registry.MustRegister(clusterGroupResourcesReady) + metrics.Registry.MustRegister(clusterGroupResourcesUnknown) + metrics.Registry.MustRegister(clusterGroupResourcesWaitApplied) + metrics.Registry.MustRegister(clusterGroupDesiredReadyBundles) + metrics.Registry.MustRegister(clusterGroupReadyBundles) + metrics.Registry.MustRegister(clusterGroupObserved, clusterGroupState) // clusterGroupState is set in CollectClusterGroupMetrics and must be exported too +} diff --git a/internal/metrics/gitrepo_metrics.go b/internal/metrics/gitrepo_metrics.go new file mode 100644 index 0000000000..01c875d9a9 --- /dev/null +++ 
b/internal/metrics/gitrepo_metrics.go
@@ -0,0 +1,158 @@
+package metrics
+
+import (
+	"strings"
+
+	fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+var (
+	gitRepoSubsystem = "gitrepo"
+	gitRepoLabels    = []string{"name", "namespace", "repo", "branch", "paths", "commit"}
+
+	gitrepoResourcesDesiredReady = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_desired_ready",
+			Help:      "The count of resources that are desired to be in a Ready state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesMissing = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_missing",
+			Help:      "The count of resources that are in a Missing state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesModified = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_modified",
+			Help:      "The count of resources that are in a Modified state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesNotReady = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_not_ready",
+			Help:      "The count of resources that are in a NotReady state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesOrphaned = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_orphaned",
+			Help:      "The count of resources that are in an Orphaned state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesReady = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_ready",
+			Help:      "The count of resources that are in a Ready state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesUnknown = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_unknown",
+			Help:      "The count of resources that are in an Unknown state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoResourcesWaitApplied = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "resources_wait_applied",
+			Help:      "The count of resources that are in a WaitApplied state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoDesiredReadyClusters = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "desired_ready_clusters",
+			Help:      "The amount of clusters desired to be in a ready state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoReadyClusters = promauto.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "ready_clusters",
+			Help:      "The count of cluster in a Ready state.",
+		},
+		gitRepoLabels,
+	)
+	gitrepoObserved = promauto.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: namespace,
+			Subsystem: gitRepoSubsystem,
+			Name:      "observations_total",
+			Help:      "The total times that this GitRepo has been observed",
+		},
+		gitRepoLabels,
+	)
+)
+
+// CollectGitRepoMetrics publishes the status of gitrepo as Prometheus
+// metrics: desired/ready cluster counts, resource counts and an observation
+// counter.
+//
+// It returns without doing anything while the package-level enabled flag is
+// false.
+func CollectGitRepoMetrics(gitrepo *fleet.GitRepo) {
+	if !enabled {
+		return
+	}
+
+	// Label set shared by every series written below. The configured paths
+	// are flattened into a single ";"-separated label value.
+	labels := prometheus.Labels{
+		"name":      gitrepo.Name,
+		"namespace": gitrepo.Namespace,
+		"repo":      gitrepo.Spec.Repo,
+		"branch":    gitrepo.Spec.Branch,
+		"paths":     strings.Join(gitrepo.Spec.Paths, ";"),
+		"commit":    gitrepo.Status.Commit,
+	}
+	gitrepoDesiredReadyClusters.With(labels).Set(float64(gitrepo.Status.DesiredReadyClusters))
+	gitrepoReadyClusters.With(labels).Set(float64(gitrepo.Status.ReadyClusters))
+
+	gitrepoResourcesMissing.With(labels).Set(float64(gitrepo.Status.ResourceCounts.Missing))
+	gitrepoResourcesModified.With(labels).Set(float64(gitrepo.Status.ResourceCounts.Modified))
+	gitrepoResourcesNotReady.With(labels).Set(float64(gitrepo.Status.ResourceCounts.NotReady))
+	gitrepoResourcesOrphaned.With(labels).Set(float64(gitrepo.Status.ResourceCounts.Orphaned))
+	gitrepoResourcesDesiredReady.With(labels).Set(float64(gitrepo.Status.ResourceCounts.DesiredReady))
+	gitrepoResourcesReady.With(labels).Set(float64(gitrepo.Status.ResourceCounts.Ready))
+	gitrepoResourcesUnknown.With(labels).Set(float64(gitrepo.Status.ResourceCounts.Unknown))
+	gitrepoResourcesWaitApplied.With(labels).Set(float64(gitrepo.Status.ResourceCounts.WaitApplied))
+
+	gitrepoObserved.With(labels).Inc()
+}
+
+// registerGitRepoMetrics registers every GitRepo collector with the
+// controller-runtime metrics registry. MustRegister panics if a registration
+// fails.
+func registerGitRepoMetrics() {
+	metrics.Registry.MustRegister(gitrepoDesiredReadyClusters)
+	metrics.Registry.MustRegister(gitrepoReadyClusters)
+	metrics.Registry.MustRegister(gitrepoResourcesMissing)
+	metrics.Registry.MustRegister(gitrepoResourcesModified)
+	metrics.Registry.MustRegister(gitrepoResourcesNotReady)
+	metrics.Registry.MustRegister(gitrepoResourcesOrphaned)
+	metrics.Registry.MustRegister(gitrepoResourcesDesiredReady)
+	metrics.Registry.MustRegister(gitrepoResourcesReady)
+	metrics.Registry.MustRegister(gitrepoResourcesUnknown)
+	metrics.Registry.MustRegister(gitrepoResourcesWaitApplied)
+	metrics.Registry.MustRegister(gitrepoObserved)
+}
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
new file mode 100644
index 0000000000..e140a76bd6
--- /dev/null
+++ b/internal/metrics/metrics.go
@@ -0,0 +1,31 @@
+package metrics
+
+import fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
+
+var (
+	// The namespace for the metrics, not the Kubernetes namespace of the
+	// resources. This is the prefix for the metric (e.g. `fleet_` for value
+	// `fleet`).
+	namespace    = "fleet"
+	bundleStates = []fleet.BundleState{
+		fleet.Ready,
+		fleet.NotReady,
+		fleet.Pending,
+		fleet.OutOfSync,
+		fleet.Modified,
+		fleet.WaitApplied,
+		fleet.ErrApplied,
+	}
+	commitLabel   = "fleet.cattle.io/commit"
+	repoNameLabel = "fleet.cattle.io/repo-name"
+	// enabled gates the Collect* functions; it is only set to true by
+	// RegisterMetrics.
+	enabled = false
+)
+
+// RegisterMetrics turns on the collection of Fleet metrics and registers the
+// collectors of every resource type.
+//
+// Only the first call has an effect. The register helpers (e.g.
+// registerGitRepoMetrics) use MustRegister, which panics when a collector is
+// registered a second time, so repeated calls return early instead. The
+// enabled flag is a plain package variable and is not synchronized.
+func RegisterMetrics() {
+	if enabled {
+		return
+	}
+
+	enabled = true
+	registerBundleDeploymentMetrics()
+	registerBundleMetrics()
+	registerClusterGroupMetrics()
+	registerClusterMetrics()
+	registerGitRepoMetrics()
+}