Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SURE-4340] Add Prometheus Metrics #2172

Merged
merged 3 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions .github/workflows/e2e-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,24 @@ jobs:
# 1. Run test cases not needing infra
ginkgo --label-filter='!infra-setup' e2e/single-cluster e2e/keep-resources

# 2. Run tests requiring only the git server
# 2. Run tests for metrics
ginkgo e2e/metrics

# 3. Run tests requiring only the git server
e2e/testenv/infra/infra setup --git-server=true
ginkgo --label-filter='infra-setup && !helm-registry && !oci-registry' e2e/single-cluster/

# 3. Run tests requiring a Helm registry
# 4. Run tests requiring a Helm registry
e2e/testenv/infra/infra setup --helm-registry=true
ginkgo --label-filter='helm-registry' e2e/single-cluster

e2e/testenv/infra/infra teardown --helm-registry=true

# 4. Run tests requiring an OCI registry
# 5. Run tests requiring an OCI registry
e2e/testenv/infra/infra setup --oci-registry=true
ginkgo --label-filter='oci-registry' e2e/single-cluster

# 5. Tear down all infra
# 6. Tear down all infra
e2e/testenv/infra/infra teardown

-
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ e2e/testenv/infra/infra
^fleet$
FleetCI-RootCA
.envrc
env.multi-cluster
env.single-cluster
13 changes: 12 additions & 1 deletion charts/fleet/templates/deployment.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{{ $shards := list "" }}
{{ if len .Values.shards }}
{{ if .Values.shards }}
{{ $shards = concat $shards .Values.shards | uniq }}
{{ end }}
{{ range $shards }}
Expand All @@ -16,6 +16,9 @@ spec:
labels:
app: fleet-controller
shard: "{{ . }}"
{{- if empty . }}
shard-default: "true"
{{- end }}
spec:
containers:
- env:
Expand Down Expand Up @@ -64,6 +67,11 @@ spec:
image: '{{ template "system_default_registry" $ }}{{ $.Values.image.repository }}:{{ $.Values.image.tag }}'
name: fleet-controller
imagePullPolicy: "{{ $.Values.image.imagePullPolicy }}"
{{- if $.Values.metrics.enabled }}
ports:
- containerPort: 8080
name: metrics
{{- end }}
command:
- fleetcontroller
{{- if not $.Values.gitops.enabled }}
Expand All @@ -73,6 +81,9 @@ spec:
- --shard-id
- {{ quote . }}
{{- end }}
{{- if not $.Values.metrics.enabled }}
- --disable-metrics
{{- end }}
{{- if $.Values.debug }}
- --debug
- --debug-level
Expand Down
17 changes: 17 additions & 0 deletions charts/fleet/templates/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{{- if .Values.metrics.enabled }}
apiVersion: v1
kind: Service
metadata:
name: monitoring-fleet-controller
labels:
app: fleet-controller
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: 8080
protocol: TCP
name: metrics
selector:
app: fleet-controller
{{- end }}
3 changes: 3 additions & 0 deletions charts/fleet/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ priorityClassName: ""
gitops:
enabled: true

metrics:
enabled: true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want metrics to be enabled by default? 🤔

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm open for discussions, but I thought it would not be too expensive to just collect and make them available by default.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's try enabling by default and see if it's costly


debug: false
debugLevel: 0
propagateDebugSettingsToAgents: true
Expand Down
11 changes: 11 additions & 0 deletions e2e/assets/clustergroup-template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
apiVersion: fleet.cattle.io/v1alpha1
kind: ClusterGroup
metadata:
name: {{ .Name }}
namespace: {{ .Namespace }}
spec:
selector:
matchLabels:
{{- range $key, $value := .MatchLabels}}
{{$key}}: {{$value}}
{{- end}}
13 changes: 13 additions & 0 deletions e2e/assets/metrics/fleetcontroller_service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
apiVersion: v1
kind: Service
metadata:
name: {{ .Name }}
spec:
selector:
app: fleet-controller
shard-default: "true"
ports:
- protocol: TCP
port: {{ .Port }}
targetPort: metrics
type: LoadBalancer
181 changes: 181 additions & 0 deletions e2e/metrics/bundle_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
package metrics_test

import (
"fmt"
"maps"
"math/rand"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

"github.com/rancher/fleet/e2e/metrics"
"github.com/rancher/fleet/e2e/testenv"
"github.com/rancher/fleet/e2e/testenv/kubectl"
fleet "github.com/rancher/fleet/pkg/apis/fleet.cattle.io/v1alpha1"
)

var _ = Describe("Bundle Metrics", Label("bundle"), func() {
const (
gitRepoName = "metrics"
branch = "master"
)

var (
// kw is the kubectl command for namespace the workload is deployed to
kw kubectl.Command
namespace string
)

BeforeEach(func() {
k = env.Kubectl.Namespace(env.Namespace)
namespace = testenv.NewNamespaceName(
gitRepoName,
rand.New(rand.NewSource(time.Now().UnixNano())),
)
kw = k.Namespace(namespace)

out, err := k.Create("ns", namespace)
Expect(err).ToNot(HaveOccurred(), out)

err = testenv.CreateGitRepo(
kw,
namespace,
gitRepoName,
branch,
"simple-manifest",
)
Expect(err).ToNot(HaveOccurred())

DeferCleanup(func() {
out, err = k.Delete("ns", namespace)
Expect(err).ToNot(HaveOccurred(), out)
})
})

When("collecting Bundle metrics", func() {
bundleMetricNames := map[string]map[string][]string{
"fleet_bundle_desired_ready": {},
"fleet_bundle_err_applied": {},
"fleet_bundle_modified": {},
"fleet_bundle_not_ready": {},
"fleet_bundle_out_of_sync": {},
"fleet_bundle_pending": {},
"fleet_bundle_ready": {},
"fleet_bundle_wait_applied": {},
"fleet_bundle_state": {
"state": []string{
string(fleet.Ready),
string(fleet.NotReady),
string(fleet.WaitApplied),
string(fleet.ErrApplied),
string(fleet.OutOfSync),
string(fleet.Pending),
string(fleet.Modified),
},
},
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we deliberately not testing the state metric?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I must have forgotten it. It needs to be tested differently.


// checkMetrics checks that the metrics exist or not exist. Custom
// checks can be added by passing a function to check. This can be used
// to check for the value of the metrics.
checkMetrics := func(
gitRepoName string,
expectExists bool,
check func(metric *metrics.Metric) error,
) func() error {
return func() error {
et := metrics.NewExporterTest(metricsURL)
expectOne := func(metricName string, labels map[string]string) error {
metric, err := et.FindOneMetric(metricName, labels)
if expectExists && err != nil {
return err
} else if !expectExists && err == nil {
return fmt.Errorf("metric %s found but not expected", metricName)
}

if check != nil {
err = check(metric)
if err != nil {
return err
}
}
return nil
}

for metricName, matchLabels := range bundleMetricNames {
identityLabels := map[string]string{
"name": gitRepoName,
"namespace": namespace,
}
labels := map[string]string{}
maps.Copy(labels, identityLabels)

if len(matchLabels) > 0 {
for labelName, labelValues := range matchLabels {
for _, labelValue := range labelValues {
labels[labelName] = labelValue
err := expectOne(metricName, labels)
if err != nil {
return err
}
}
}
} else {
err := expectOne(metricName, labels)
if err != nil {
return err
}
}
}
return nil
}
}

It("should have one metric for each specified metric and label value", func() {
Eventually(checkMetrics(gitRepoName+"-simple-manifest", true, func(metric *metrics.Metric) error {
// No cluster exists in the namespace where our GitRepo has been deployed, hence
// we expect the values of the metrics to be 0.
if value := metric.Gauge.GetValue(); value != float64(0) {
return fmt.Errorf("unexpected metric value: expected 0, found %f", value)
}
return nil
})).ShouldNot(HaveOccurred())
})

When("the GitRepo (and therefore Bundle) is changed", Label("bundle-modified"), func() {
It("should not duplicate metrics if Bundle is updated", Label("bundle-update"), func() {
// et := metrics.NewExporterTest(metricsURL)
out, err := kw.Patch(
"gitrepo", gitRepoName,
"--type=json",
"-p", `[{"op": "replace", "path": "/spec/paths", "value": ["simple-chart"]}]`,
)
Expect(err).ToNot(HaveOccurred(), out)
Expect(out).To(ContainSubstring("gitrepo.fleet.cattle.io/metrics patched"))

Eventually(checkMetrics(gitRepoName+"-simple-chart", true, func(metric *metrics.Metric) error {
if metric.LabelValue("paths") == "simple-manifest" {
return fmt.Errorf("path for metric %s unchanged", metric.Metric.String())
}
return nil
})).ShouldNot(HaveOccurred())
})

It("should not keep metrics if Bundle is deleted", Label("bundle-delete"), func() {
gitRepoName := gitRepoName + "-simple-manifest"

var (
out string
err error
)
Eventually(func() error {
out, err = kw.Delete("bundle", gitRepoName)
return err
}).ShouldNot(HaveOccurred(), out)

Eventually(checkMetrics(gitRepoName, false, nil)).ShouldNot(HaveOccurred())
})
})
})
})
Loading