From 6934360fed26afbd2586f36af2e795b4b128892a Mon Sep 17 00:00:00 2001 From: Xavier Vello Date: Fri, 31 Aug 2018 18:21:05 +0200 Subject: [PATCH] Limit Prometheus/OpenMetrics checks to 2000 metrics per run by default (#2093) * allow checks to limit the number of metric contexts they submit * set limit for prom checks to 2000 * set limits on all prom children checks to 0 (unlimited) * make the metric limit configurable * do not allow to disable limit if class has set one --- .../datadog_checks/checks/base.py | 53 ++++++++++++- .../checks/openmetrics/base_check.py | 2 + .../checks/prometheus/base_check.py | 2 + .../checks/prometheus/prometheus_base.py | 2 + .../datadog_checks/utils/limiter.py | 67 ++++++++++++++++ datadog_checks_base/tests/test_agent_check.py | 78 +++++++++++++++++++ datadog_checks_base/tests/test_utils.py | 55 +++++++++++++ gitlab/datadog_checks/gitlab/gitlab.py | 1 + .../gitlab_runner/gitlab_runner.py | 1 + istio/datadog_checks/istio/istio.py | 1 + kube_dns/datadog_checks/kube_dns/kube_dns.py | 2 + .../datadog_checks/kube_proxy/kube_proxy.py | 1 + kubelet/datadog_checks/kubelet/kubelet.py | 1 + .../kubernetes_state/kubernetes_state.py | 2 + linkerd/datadog_checks/linkerd/linkerd.py | 2 + prometheus/README.md | 5 ++ .../prometheus/data/conf.yaml.example | 3 + 17 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 datadog_checks_base/datadog_checks/utils/limiter.py diff --git a/datadog_checks_base/datadog_checks/checks/base.py b/datadog_checks_base/datadog_checks/checks/base.py index 595604cf039d2..487a2ee5d59ad 100644 --- a/datadog_checks_base/datadog_checks/checks/base.py +++ b/datadog_checks_base/datadog_checks/checks/base.py @@ -26,6 +26,15 @@ from ..config import is_affirmative from ..utils.common import ensure_bytes from ..utils.proxy import config_proxy_skip +from ..utils.limiter import Limiter + + +# Metric types for which it's only useful to submit once per context +ONE_PER_CONTEXT_METRIC_TYPES = [ + aggregator.GAUGE, + aggregator.RATE, + aggregator.MONOTONIC_COUNT, +] class AgentCheck(object): @@ -34,6 +43,18 @@ class AgentCheck(object): """ OK, WARNING, CRITICAL, UNKNOWN = (0, 1, 2, 3) + """ + DEFAULT_METRIC_LIMIT allows to set a limit on metric contexts this check can send + per run. This is useful for check that have an unbounded number of contexts, + depending on the input payload. + The logic counts one context per gauge/rate/monotonic_count call, and deduplicates + contexts for other metric types. The first N contexts in submission order will + be sent to the aggregator, the rest are dropped. The state is reset after each run. + + See https://github.com/DataDog/integrations-core/pull/2093 for more information + """ + DEFAULT_METRIC_LIMIT = 0 + def __init__(self, *args, **kwargs): """ args: `name`, `init_config`, `agentConfig` (deprecated), `instances` @@ -45,6 +66,7 @@ def __init__(self, *args, **kwargs): self.init_config = kwargs.get('init_config', {}) self.agentConfig = kwargs.get('agentConfig', {}) self.warnings = [] + self.metric_limiter = None if len(args) > 0: self.name = args[0] @@ -98,6 +120,19 @@ def __init__(self, *args, **kwargs): ], } + # Setup metric limits + try: + metric_limit = self.instances[0].get("max_returned_metrics", self.DEFAULT_METRIC_LIMIT) + # Do not allow to disable limiting if the class has set a non-zero default value + if metric_limit == 0 and self.DEFAULT_METRIC_LIMIT > 0: + metric_limit = self.DEFAULT_METRIC_LIMIT + self.warning("Setting max_returned_metrics to zero is not allowed," + + "reverting to the default of {} metrics".format(self.DEFAULT_METRIC_LIMIT)) + except Exception: + metric_limit = self.DEFAULT_METRIC_LIMIT + if metric_limit > 0: + self.metric_limiter = Limiter("metrics", metric_limit, self.warning) + @property def in_developer_mode(self): self._log_deprecation('in_developer_mode') @@ -117,6 +152,9 @@ def get_instance_proxy(self, instance, uri, proxies=None): return config_proxy_skip(proxies, uri, skip) + def _context_uid(mtype, name, value, tags=None, hostname=None): + return '{}-{}-{}-{}'.format(mtype, name, tags if tags is None else hash(frozenset(tags)), hostname) + def _submit_metric(self, mtype, name, value, tags=None, hostname=None, device_name=None): if value is None: # ignore metric sample @@ -126,6 +164,17 @@ def _submit_metric(self, mtype, name, value, tags=None, hostname=None, device_na if hostname is None: hostname = b'' + if self.metric_limiter: + if mtype in ONE_PER_CONTEXT_METRIC_TYPES: + # Fast path for gauges, rates, monotonic counters, assume one context per call + if self.metric_limiter.is_reached(): + return + else: + # Other metric types have a legit use case for several calls per context, track unique contexts + context = self._context_uid(mtype, name, tags, hostname) + if self.metric_limiter.is_reached(context): + return + aggregator.submit_metric(self, self.check_id, mtype, ensure_bytes(name), float(value), tags, hostname) def gauge(self, name, value, tags=None, hostname=None, device_name=None): @@ -302,7 +351,6 @@ def run(self): try: self.check(copy.deepcopy(self.instances[0])) result = b'' - except Exception as e: result = json.dumps([ { @@ -310,6 +358,9 @@ def run(self): "traceback": traceback.format_exc(), } ]) + finally: + if self.metric_limiter: + self.metric_limiter.reset() return result diff --git a/datadog_checks_base/datadog_checks/checks/openmetrics/base_check.py b/datadog_checks_base/datadog_checks/checks/openmetrics/base_check.py index 23abf641c4f8f..808d63b4ddd3b 100644 --- a/datadog_checks_base/datadog_checks/checks/openmetrics/base_check.py +++ b/datadog_checks_base/datadog_checks/checks/openmetrics/base_check.py @@ -20,6 +20,8 @@ class OpenMetricsBaseCheck(OpenMetricsScraperMixin, AgentCheck): - bar - foo """ + DEFAULT_METRIC_LIMIT = 2000 + def __init__(self, name, init_config, agentConfig, instances=None, default_instances=None, default_namespace=None): super(OpenMetricsBaseCheck, self).__init__(name, init_config, agentConfig, instances=instances) self.config_map = {} diff --git a/datadog_checks_base/datadog_checks/checks/prometheus/base_check.py b/datadog_checks_base/datadog_checks/checks/prometheus/base_check.py index 1a3ee6c8f9a89..b1a350afe3c30 100644 --- a/datadog_checks_base/datadog_checks/checks/prometheus/base_check.py +++ b/datadog_checks_base/datadog_checks/checks/prometheus/base_check.py @@ -84,6 +84,8 @@ class GenericPrometheusCheck(AgentCheck): - bar - foo """ + DEFAULT_METRIC_LIMIT = 2000 + def __init__(self, name, init_config, agentConfig, instances=None, default_instances=None, default_namespace=""): super(GenericPrometheusCheck, self).__init__(name, init_config, agentConfig, instances) self.scrapers_map = {} diff --git a/datadog_checks_base/datadog_checks/checks/prometheus/prometheus_base.py b/datadog_checks_base/datadog_checks/checks/prometheus/prometheus_base.py index 8200c8fb49311..f47ed1a149d8b 100644 --- a/datadog_checks_base/datadog_checks/checks/prometheus/prometheus_base.py +++ b/datadog_checks_base/datadog_checks/checks/prometheus/prometheus_base.py @@ -22,6 +22,8 @@ # class PrometheusCheck(PrometheusScraperMixin, AgentCheck): + DEFAULT_METRIC_LIMIT = 2000 + def __init__(self, name, init_config, agentConfig, instances=None): super(PrometheusCheck, self).__init__(name, init_config, agentConfig, instances) diff --git a/datadog_checks_base/datadog_checks/utils/limiter.py b/datadog_checks_base/datadog_checks/utils/limiter.py new file mode 100644 index 0000000000000..348589493699e --- /dev/null +++ b/datadog_checks_base/datadog_checks/utils/limiter.py @@ -0,0 +1,67 @@ +# (C) Datadog, Inc. 2018 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + + +class Limiter(object): + """ + Limiter implements a simple cut-off capping logic for object count. + It is used by the AgentCheck class to limit the number of metric contexts + that can be set by an instance. + """ + def __init__(self, object_name, object_limit, warning_func=None): + """ + :param object_name: (plural) name of counted objects for warning wording + :param object_limit: maximum number of objects to accept before limiting + :param warning_func: callback function, called with a string when limit is exceeded + """ + self.warning = warning_func + self.name = object_name + self.limit = object_limit + + self.reached_limit = False + self.count = 0 + self.seen = set() + + def reset(self): + """ + Resets state and uid set. To be called asap to free memory + """ + self.reached_limit = False + self.count = 0 + self.seen.clear() + + def is_reached(self, uid=None): + """ + is_reached is to be called for every object that counts towards the limit. + - When called with no uid, the Limiter assumes this is a new object and + unconditionally increments the counter (less CPU and memory usage). + - When a given object can be passed multiple times, a uid must be provided to + deduplicate calls. Only the first occurrence of a uid will increment the counter. + + :param uid: (optional) unique identifier of the object, to deduplicate calls + :returns: boolean, true if limit exceeded + """ + if self.reached_limit: + return True + + if uid: + if uid in self.seen: + return False + self.count += 1 + self.seen.add(uid) + else: + self.count += 1 + + if self.count > self.limit: + if self.warning: + self.warning("Exceeded limit of {} {}, ignoring next ones".format(self.limit, self.name)) + self.reached_limit = True + return True + return False + + def get_status(self): + """ + Returns the internal state of the limiter for unit tests + """ + return (self.count, self.limit, self.reached_limit) diff --git a/datadog_checks_base/tests/test_agent_check.py b/datadog_checks_base/tests/test_agent_check.py index 3b25768b7b6b1..16958fb8e4448 100644 --- a/datadog_checks_base/tests/test_agent_check.py +++ b/datadog_checks_base/tests/test_agent_check.py @@ -1,9 +1,18 @@ # (C) Datadog, Inc. 2018 # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) +import pytest + from datadog_checks.checks import AgentCheck +@pytest.fixture +def aggregator(): + from datadog_checks.stubs import aggregator + aggregator.reset() + return aggregator + + def test_instance(): """ Simply assert the class can be insantiated @@ -45,3 +54,72 @@ def test_unicode_string(self): assert normalized_tags is not tags assert normalized_tag == tag.encode('utf-8') + + +class LimitedCheck(AgentCheck): + DEFAULT_METRIC_LIMIT = 10 + + +class TestLimits(): + def test_metric_limit_gauges(self, aggregator): + check = LimitedCheck() + assert check.get_warnings() == [] + + for i in range(0, 10): + check.gauge("metric", 0) + assert len(check.get_warnings()) == 0 + assert len(aggregator.metrics("metric")) == 10 + + for i in range(0, 10): + check.gauge("metric", 0) + assert len(check.get_warnings()) == 1 + assert len(aggregator.metrics("metric")) == 10 + + def test_metric_limit_count(self, aggregator): + check = LimitedCheck() + assert check.get_warnings() == [] + + # Multiple calls for a single context should not trigger + for i in range(0, 20): + check.count("metric", 0, hostname="host-single") + assert len(check.get_warnings()) == 0 + assert len(aggregator.metrics("metric")) == 20 + + # Multiple contexts should trigger + # Only 9 new contexts should pass through + for i in range(0, 20): + check.count("metric", 0, hostname="host-{}".format(i)) + assert len(check.get_warnings()) == 1 + assert len(aggregator.metrics("metric")) == 29 + + def test_metric_limit_instance_config(self, aggregator): + instances = [ + { + "max_returned_metrics": 42, + } + ] + check = AgentCheck("test", {}, instances) + assert check.get_warnings() == [] + + for i in range(0, 42): + check.gauge("metric", 0) + assert len(check.get_warnings()) == 0 + assert len(aggregator.metrics("metric")) == 42 + + check.gauge("metric", 0) + assert len(check.get_warnings()) == 1 + assert len(aggregator.metrics("metric")) == 42 + + def test_metric_limit_instance_config_zero(self, aggregator): + instances = [ + { + "max_returned_metrics": 0, + } + ] + check = LimitedCheck("test", {}, instances) + assert len(check.get_warnings()) == 1 + + for i in range(0, 42): + check.gauge("metric", 0) + assert len(check.get_warnings()) == 1 # get_warnings resets the array + assert len(aggregator.metrics("metric")) == 10 diff --git a/datadog_checks_base/tests/test_utils.py b/datadog_checks_base/tests/test_utils.py index ee10b212675d6..f86669f1a00ab 100644 --- a/datadog_checks_base/tests/test_utils.py +++ b/datadog_checks_base/tests/test_utils.py @@ -1,7 +1,9 @@ # (C) Datadog, Inc. 2018 # All rights reserved # Licensed under a 3-clause BSD style license (see LICENSE) + from datadog_checks.utils.common import pattern_filter +from datadog_checks.utils.limiter import Limiter class Item: @@ -50,3 +52,56 @@ def test_key_function(self): assert pattern_filter(items, whitelist=whitelist, key=lambda item: item.name) == [ Item('abc'), Item('def'), Item('abcdef') ] + + +class TestLimiter(): + def test_no_uid(self): + warnings = [] + limiter = Limiter("names", 10, warning_func=warnings.append) + for i in range(0, 10): + assert limiter.is_reached() is False + assert limiter.get_status() == (10, 10, False) + + # Reach limit + assert limiter.is_reached() is True + assert limiter.get_status() == (11, 10, True) + assert warnings == ["Exceeded limit of 10 names, ignoring next ones"] + + # Make sure warning is only sent once + assert limiter.is_reached() is True + assert len(warnings) == 1 + + def test_with_uid(self): + warnings = [] + limiter = Limiter("names", 10, warning_func=warnings.append) + for i in range(0, 20): + assert limiter.is_reached("dummy1") is False + assert limiter.get_status() == (1, 10, False) + + for i in range(0, 20): + assert limiter.is_reached("dummy2") is False + assert limiter.get_status() == (2, 10, False) + assert len(warnings) == 0 + + def test_mixed(self): + limiter = Limiter("names", 10) + + for i in range(0, 20): + assert limiter.is_reached("dummy1") is False + assert limiter.get_status() == (1, 10, False) + + for i in range(0, 5): + assert limiter.is_reached() is False + assert limiter.get_status() == (6, 10, False) + + def test_reset(self): + limiter = Limiter("names", 10) + + for i in range(1, 20): + limiter.is_reached("dummy1") + assert limiter.get_status() == (1, 10, False) + + limiter.reset() + assert limiter.get_status() == (0, 10, False) + assert limiter.is_reached("dummy1") is False + assert limiter.get_status() == (1, 10, False) diff --git a/gitlab/datadog_checks/gitlab/gitlab.py b/gitlab/datadog_checks/gitlab/gitlab.py index 56c25b39f7b79..89bd5b63e37c0 100644 --- a/gitlab/datadog_checks/gitlab/gitlab.py +++ b/gitlab/datadog_checks/gitlab/gitlab.py @@ -24,6 +24,7 @@ class GitlabCheck(PrometheusCheck): EVENT_TYPE = SOURCE_TYPE_NAME = 'gitlab' DEFAULT_CONNECT_TIMEOUT = 5 DEFAULT_RECEIVE_TIMEOUT = 15 + DEFAULT_METRIC_LIMIT = 0 PROMETHEUS_SERVICE_CHECK_NAME = 'gitlab.prometheus_endpoint_up' diff --git a/gitlab_runner/datadog_checks/gitlab_runner/gitlab_runner.py b/gitlab_runner/datadog_checks/gitlab_runner/gitlab_runner.py index a275a2b5d7d90..d3a0dab23b726 100644 --- a/gitlab_runner/datadog_checks/gitlab_runner/gitlab_runner.py +++ b/gitlab_runner/datadog_checks/gitlab_runner/gitlab_runner.py @@ -25,6 +25,7 @@ class GitlabRunnerCheck(PrometheusCheck): DEFAULT_CONNECT_TIMEOUT = 5 DEFAULT_RECEIVE_TIMEOUT = 15 + DEFAULT_METRIC_LIMIT = 0 def __init__(self, name, init_config, agentConfig, instances=None): super(GitlabRunnerCheck, self).__init__(name, init_config, agentConfig, instances) diff --git a/istio/datadog_checks/istio/istio.py b/istio/datadog_checks/istio/istio.py index a5ba2beb441d1..a20a12806e3bb 100644 --- a/istio/datadog_checks/istio/istio.py +++ b/istio/datadog_checks/istio/istio.py @@ -8,6 +8,7 @@ class Istio(PrometheusCheck): MIXER_NAMESPACE = 'istio.mixer' MESH_NAMESPACE = 'istio.mesh' + DEFAULT_METRIC_LIMIT = 0 def __init__(self, name, init_config, agentConfig, instances=None): super(Istio, self).__init__(name, init_config, agentConfig, instances) diff --git a/kube_dns/datadog_checks/kube_dns/kube_dns.py b/kube_dns/datadog_checks/kube_dns/kube_dns.py index 0f576f1573441..3619b7158e5c4 100644 --- a/kube_dns/datadog_checks/kube_dns/kube_dns.py +++ b/kube_dns/datadog_checks/kube_dns/kube_dns.py @@ -12,6 +12,8 @@ class KubeDNSCheck(PrometheusCheck): """ Collect kube-dns metrics from Prometheus """ + DEFAULT_METRIC_LIMIT = 0 + def __init__(self, name, init_config, agentConfig, instances=None): super(KubeDNSCheck, self).__init__(name, init_config, agentConfig, instances) self.NAMESPACE = 'kubedns' diff --git a/kube_proxy/datadog_checks/kube_proxy/kube_proxy.py b/kube_proxy/datadog_checks/kube_proxy/kube_proxy.py index b92c707887001..c56cfcc0aa595 100644 --- a/kube_proxy/datadog_checks/kube_proxy/kube_proxy.py +++ b/kube_proxy/datadog_checks/kube_proxy/kube_proxy.py @@ -4,6 +4,7 @@ from datadog_checks.checks.prometheus import GenericPrometheusCheck class KubeProxyCheck(GenericPrometheusCheck): + DEFAULT_METRIC_LIMIT = 0 def __init__(self, name, init_config, agentConfig, instances=None): super(KubeProxyCheck, self).__init__( diff --git a/kubelet/datadog_checks/kubelet/kubelet.py b/kubelet/datadog_checks/kubelet/kubelet.py index ff3880e8eaec3..9eee09b6f69db 100644 --- a/kubelet/datadog_checks/kubelet/kubelet.py +++ b/kubelet/datadog_checks/kubelet/kubelet.py @@ -55,6 +55,7 @@ class KubeletCheck(AgentCheck, CadvisorScraper): """ Collect metrics from Kubelet. """ + DEFAULT_METRIC_LIMIT = 0 def __init__(self, name, init_config, agentConfig, instances=None): super(KubeletCheck, self).__init__(name, init_config, agentConfig, instances) diff --git a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py index c4698973b1e16..43fd715d9899b 100644 --- a/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py +++ b/kubernetes_state/datadog_checks/kubernetes_state/kubernetes_state.py @@ -22,6 +22,8 @@ class KubernetesState(PrometheusCheck): Collect kube-state-metrics metrics in the Prometheus format See https://github.com/kubernetes/kube-state-metrics """ + DEFAULT_METRIC_LIMIT = 0 + def __init__(self, name, init_config, agentConfig, instances=None): super(KubernetesState, self).__init__(name, init_config, agentConfig, instances) self.NAMESPACE = 'kubernetes_state' diff --git a/linkerd/datadog_checks/linkerd/linkerd.py b/linkerd/datadog_checks/linkerd/linkerd.py index 384006b593bac..437be417cbe7a 100644 --- a/linkerd/datadog_checks/linkerd/linkerd.py +++ b/linkerd/datadog_checks/linkerd/linkerd.py @@ -10,6 +10,8 @@ class LinkerdCheck(GenericPrometheusCheck): """ Collect linkerd metrics from Prometheus """ + DEFAULT_METRIC_LIMIT = 0 + def __init__(self, name, init_config, agentConfig, instances=None): labels_mapper = { 'rt' : 'linkerd_router', diff --git a/prometheus/README.md b/prometheus/README.md index 01b56bc96d7b9..4d919e3c6e55c 100644 --- a/prometheus/README.md +++ b/prometheus/README.md @@ -26,6 +26,11 @@ There is also a couple of more advanced settings (ssl, labels joining, custom ta If you are monitoring an off-the-shelf software and you think it would deserve an official integration, have a look at `kube-proxy` for an example, and don't hesitate to contribute. +Due to the nature of this integration, it is possible to submit a high number of custom metrics +to Datadog. To provide users control over the maximum number of metrics sent in the case of +configuration errors or input changes, the check has a default limit of 2000 metrics. +You can increase this limit, if needed, by setting the `max_returned_metrics` option. + ### Validation [Run the Agent's `status` subcommand][1] and look for `prometheus` under the Checks section. diff --git a/prometheus/datadog_checks/prometheus/data/conf.yaml.example b/prometheus/datadog_checks/prometheus/data/conf.yaml.example index 198054240125b..91b22da0ec8ba 100644 --- a/prometheus/datadog_checks/prometheus/data/conf.yaml.example +++ b/prometheus/datadog_checks/prometheus/data/conf.yaml.example @@ -82,5 +82,8 @@ instances: # The path to the trusted CA used for generating custom certificates # ssl_ca_cert: "/path/to/cacert" + # The check limits itself to 2000 metrics by default, you can increase this limit if needed + # max_returned_metrics: 2000 + # Set a timeout for the prometheus query, defaults to 10 # prometheus_timeout: 10