From fae0286fa2f51bad87865b59418a9adc014ef6ec Mon Sep 17 00:00:00 2001 From: Nick Peluso <10912027+nap32@users.noreply.github.com> Date: Fri, 8 Sep 2023 11:32:12 -0700 Subject: [PATCH] metrics: Add metrics label filter configuration Currently, metrics are all-or-nothing. Certain labels may cause cardinality issues. This patch introduces a new configuration option - MetricsLabelFilter. It is an allow-list for configuring namespace, workload, pod, and binary. Labels that utilize these fields will only add them if configured for it. Fixes: #1037 Signed-off-by: Nick Peluso <10912027+nap32@users.noreply.github.com> --- cmd/tetragon/flags.go | 18 +++++++----- .../templates/tetragon_configmap.yaml | 3 ++ install/kubernetes/values.yaml | 3 ++ pkg/metrics/consts/consts.go | 1 + pkg/metrics/eventmetrics/eventmetrics.go | 8 ++--- pkg/metrics/metrics.go | 19 ++++++++++++ pkg/metrics/metrics_test.go | 29 +++++++++++++++++++ pkg/metrics/syscallmetrics/syscallmetrics.go | 4 +-- pkg/option/config.go | 19 +++++++++--- 9 files changed, 86 insertions(+), 18 deletions(-) diff --git a/cmd/tetragon/flags.go b/cmd/tetragon/flags.go index 986de7aada9..3ddec22888c 100644 --- a/cmd/tetragon/flags.go +++ b/cmd/tetragon/flags.go @@ -31,14 +31,15 @@ const ( keyEnableCiliumAPI = "enable-cilium-api" keyEnableProcessAncestors = "enable-process-ancestors" - keyMetricsServer = "metrics-server" - keyServerAddress = "server-address" - keyGopsAddr = "gops-address" - keyEnableProcessCred = "enable-process-cred" - keyEnableProcessNs = "enable-process-ns" - keyConfigFile = "config-file" - keyTracingPolicy = "tracing-policy" - keyTracingPolicyDir = "tracing-policy-dir" + keyMetricsServer = "metrics-server" + keyMetricsLabelFilter = "metrics-label-filter" + keyServerAddress = "server-address" + keyGopsAddr = "gops-address" + keyEnableProcessCred = "enable-process-cred" + keyEnableProcessNs = "enable-process-ns" + keyConfigFile = "config-file" + keyTracingPolicy = "tracing-policy" + keyTracingPolicyDir = 
"tracing-policy-dir" keyCpuProfile = "cpuprofile" keyMemProfile = "memprofile" @@ -114,6 +115,7 @@ func readAndSetFlags() { option.Config.DataCacheSize = viper.GetInt(keyDataCacheSize) option.Config.MetricsServer = viper.GetString(keyMetricsServer) + option.Config.MetricsLabelFilter = viper.GetStringMap(keyMetricsLabelFilter) option.Config.ServerAddress = viper.GetString(keyServerAddress) option.Config.ExportFilename = viper.GetString(keyExportFilename) diff --git a/install/kubernetes/templates/tetragon_configmap.yaml b/install/kubernetes/templates/tetragon_configmap.yaml index 242e9c9d959..0c10c60a376 100644 --- a/install/kubernetes/templates/tetragon_configmap.yaml +++ b/install/kubernetes/templates/tetragon_configmap.yaml @@ -37,6 +37,9 @@ data: {{- else }} metrics-server: "" {{- end }} +{{- if .Values.tetragon.prometheus.enabled }} + metrics-label-filter: {{.Values.tetragon.prometheus.metricsLabelFilter | join "," }} +{{- end }} {{- if .Values.tetragon.grpc.enabled }} server-address: {{ .Values.tetragon.grpc.address }} {{- else }} diff --git a/install/kubernetes/values.yaml b/install/kubernetes/values.yaml index 717d10453f0..0215737a025 100644 --- a/install/kubernetes/values.yaml +++ b/install/kubernetes/values.yaml @@ -126,6 +126,9 @@ tetragon: address: "" # -- The port at which to expose metrics. port: 2112 + # -- The labels to include with supporting metrics. + # The possible values are "namespace", "workload", "pod" and "binary". + metricsLabelFilter: ["namespace", "workload", "pod", "binary"] serviceMonitor: # -- Whether to create a 'ServiceMonitor' resource targeting the 'tetragon' pods. 
enabled: false diff --git a/pkg/metrics/consts/consts.go b/pkg/metrics/consts/consts.go index f7534053268..a55642f537f 100644 --- a/pkg/metrics/consts/consts.go +++ b/pkg/metrics/consts/consts.go @@ -4,3 +4,4 @@ package consts var MetricsNamespace = "tetragon" +var KnownMetricLabelFilters = []string{"namespace", "workload", "pod", "binary"} diff --git a/pkg/metrics/eventmetrics/eventmetrics.go b/pkg/metrics/eventmetrics/eventmetrics.go index 3a4afb826e6..69c15a517c7 100644 --- a/pkg/metrics/eventmetrics/eventmetrics.go +++ b/pkg/metrics/eventmetrics/eventmetrics.go @@ -25,7 +25,7 @@ var ( Name: "events_total", Help: "The total number of Tetragon events", ConstLabels: nil, - }, []string{"type", "namespace", "workload", "pod", "binary"}) + }, metrics.FilterMetricLabels(append([]string{"type"}, consts.KnownMetricLabelFilters...)...)) FlagCount = prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: consts.MetricsNamespace, Name: "flags_total", @@ -44,7 +44,7 @@ var ( Name: "policy_events_total", Help: "Policy events calls observed.", ConstLabels: nil, - }, []string{"policy", "hook", "namespace", "workload", "pod", "binary"}) + }, metrics.FilterMetricLabels(append([]string{"policy", "hook"}, consts.KnownMetricLabelFilters...)...)) ) func InitMetrics(registry *prometheus.Registry) { @@ -93,10 +93,10 @@ func handleProcessedEvent(pInfo *tracingpolicy.PolicyInfo, processedEvent interf default: eventType = "unknown" } - EventsProcessed.WithLabelValues(eventType, namespace, workload, pod, binary).Inc() + EventsProcessed.WithLabelValues(metrics.FilterMetricLabels(eventType, namespace, workload, pod, binary)...).Inc() if pInfo != nil && pInfo.Name != "" { policyStats. - WithLabelValues(pInfo.Name, pInfo.Hook, namespace, workload, pod, binary). + WithLabelValues(metrics.FilterMetricLabels(pInfo.Name, pInfo.Hook, namespace, workload, pod, binary)...). 
Inc() } } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 7c232de0b41..66f8faa6860 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -9,6 +9,8 @@ import ( "time" "github.com/cilium/tetragon/pkg/logger" + "github.com/cilium/tetragon/pkg/metrics/consts" + "github.com/cilium/tetragon/pkg/option" "github.com/cilium/tetragon/pkg/podhooks" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" @@ -142,3 +144,20 @@ func EnableMetrics(address string) { http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{Registry: reg})) http.ListenAndServe(address, nil) } + +// FilterMetricLabels returns labels with each of the trailing filterable entries dropped unless its name in consts.KnownMetricLabelFilters is a key of option.Config.MetricsLabelFilter. +// IMPORTANT! The filterable labels must be passed last and in the exact order of consts.KnownMetricLabelFilters; the result is built in place on top of labels, so it shares (and may overwrite) the argument's backing array. +func FilterMetricLabels(labels ...string) []string { + offset := len(labels) - len(consts.KnownMetricLabelFilters) + if offset < 0 { + // Fewer labels than consts.KnownMetricLabelFilters were passed, so there is nothing to filter; return them unchanged.
+ return labels + } + result := labels[:offset] + for i, label := range consts.KnownMetricLabelFilters { + if _, ok := option.Config.MetricsLabelFilter[label]; ok { + result = append(result, labels[offset+i]) + } + } + return result +} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go index 82f613471ed..6ee1a27c9a4 100644 --- a/pkg/metrics/metrics_test.go +++ b/pkg/metrics/metrics_test.go @@ -18,12 +18,41 @@ import ( "github.com/cilium/tetragon/pkg/metrics" "github.com/cilium/tetragon/pkg/metrics/config" "github.com/cilium/tetragon/pkg/metrics/eventmetrics" + "github.com/cilium/tetragon/pkg/option" ) var sampleMsgGenericTracepointUnix = tracing.MsgGenericTracepointUnix{ PolicyName: "fake-policy", } +func TestFilterMetricLabels(t *testing.T) { + option.Config.MetricsLabelFilter = map[string]interface{}{ + "namespace": nil, + "workload": nil, + "pod": nil, + "binary": nil, + } + assert.Equal(t, []string{"type", "namespace", "workspace", "pod", "binary"}, metrics.FilterMetricLabels("type", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"syscall", "namespace", "workspace", "pod", "binary"}, metrics.FilterMetricLabels("syscall", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"namespace", "workspace", "pod", "binary"}, metrics.FilterMetricLabels("namespace", "workspace", "pod", "binary")) + + option.Config.MetricsLabelFilter = map[string]interface{}{ + "namespace": nil, + "workload": nil, + } + assert.Equal(t, []string{"type", "namespace", "workspace"}, metrics.FilterMetricLabels("type", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"syscall", "namespace", "workspace"}, metrics.FilterMetricLabels("syscall", "namespace", "workspace", "pod", "binary")) + assert.Equal(t, []string{"namespace", "workspace"}, metrics.FilterMetricLabels("namespace", "workspace", "pod", "binary")) + + option.Config.MetricsLabelFilter = map[string]interface{}{ + "namespace": nil, + "workload": nil, + 
"pod": nil, + "binary": nil, + } + assert.Equal(t, []string{"type", "syscall"}, metrics.FilterMetricLabels("type", "syscall")) +} + func TestPodDelete(t *testing.T) { reg := metrics.GetRegistry() config.InitAllMetrics(reg) diff --git a/pkg/metrics/syscallmetrics/syscallmetrics.go b/pkg/metrics/syscallmetrics/syscallmetrics.go index fff33972854..a39d2d2ecaf 100644 --- a/pkg/metrics/syscallmetrics/syscallmetrics.go +++ b/pkg/metrics/syscallmetrics/syscallmetrics.go @@ -17,7 +17,7 @@ var ( Name: "syscalls_total", Help: "System calls observed.", ConstLabels: nil, - }, []string{"syscall", "namespace", "workload", "pod", "binary"}) + }, metrics.FilterMetricLabels(append([]string{"syscall"}, consts.KnownMetricLabelFilters...)...)) ) func InitMetrics(registry *prometheus.Registry) { @@ -46,7 +46,7 @@ func Handle(event interface{}) { } if syscall != "" { - syscallStats.WithLabelValues(syscall, namespace, workload, pod, binary).Inc() + syscallStats.WithLabelValues(metrics.FilterMetricLabels(syscall, namespace, workload, pod, binary)...).Inc() } } diff --git a/pkg/option/config.go b/pkg/option/config.go index 20789896753..06dadb6723c 100644 --- a/pkg/option/config.go +++ b/pkg/option/config.go @@ -11,6 +11,7 @@ import ( "time" "github.com/cilium/tetragon/pkg/logger" + "github.com/cilium/tetragon/pkg/metrics/consts" "github.com/spf13/viper" ) @@ -46,10 +47,11 @@ type config struct { ProcessCacheSize int DataCacheSize int - MetricsServer string - ServerAddress string - TracingPolicy string - TracingPolicyDir string + MetricsServer string + MetricsLabelFilter map[string]interface{} + ServerAddress string + TracingPolicy string + TracingPolicyDir string ExportFilename string ExportFileMaxSizeMB int @@ -93,6 +95,15 @@ var ( // LogOpts contains logger parameters LogOpts: make(map[string]string), + + // Default to logging metrics with the greatest granularity. 
+ MetricsLabelFilter: func() map[string]interface{} { + result := make(map[string]interface{}) + for _, label := range consts.KnownMetricLabelFilters { + result[label] = nil + } + return result + }(), } )