From 325cadc8c4a64137a93aaf8ea199cc36e6fa30a3 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 15 Aug 2022 21:46:07 -0500 Subject: [PATCH 1/4] Add metrics to track how the rate limiter is affecting requests Related to https://github.com/matrix-org/synapse/pull/13499 Mentioned in https://docs.google.com/document/d/1lvUoVfYUiy6UaHB6Rb4HicjaJAU40-APue9Q4vzuW3c/edit#bookmark=id.zjko88lr25j --- synapse/util/ratelimitutils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 6394cc39ac02..fa6f63c77846 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -20,6 +20,8 @@ from twisted.internet import defer +from prometheus_client.core import Counter + from synapse.api.errors import LimitExceededError from synapse.config.ratelimiting import FederationRatelimitSettings from synapse.logging.context import ( @@ -35,6 +37,11 @@ logger = logging.getLogger(__name__) +rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "", ["host"]) + +rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "", ["host"]) + + class FederationRateLimiter: def __init__(self, clock: Clock, config: FederationRatelimitSettings): def new_limiter() -> "_PerHostRatelimiter": @@ -59,7 +66,7 @@ def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None] Returns: context manager which returns a deferred. """ - return self.ratelimiters[host].ratelimit() + return self.ratelimiters[host].ratelimit(host) class _PerHostRatelimiter: @@ -94,12 +101,14 @@ def __init__(self, clock: Clock, config: FederationRatelimitSettings): self.request_times: List[int] = [] @contextlib.contextmanager - def ratelimit(self) -> "Iterator[defer.Deferred[None]]": + def ratelimit(self, host: str) -> "Iterator[defer.Deferred[None]]": # `contextlib.contextmanager` takes a generator and turns it into a # context manager. The generator should only yield once with a value # to be returned by manager. # Exceptions will be reraised at the yield. + self.host = host + request_id = object() ret = self._on_enter(request_id) try: @@ -119,6 +128,7 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": # sleeping or in the ready queue). queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) if queue_size > self.reject_limit: + rate_limit_reject_counter.labels(self.host).inc() raise LimitExceededError( retry_after_ms=int(self.window_size / self.sleep_limit) ) @@ -146,6 +156,7 @@ def queue_request() -> "defer.Deferred[None]": if len(self.request_times) > self.sleep_limit: logger.debug("Ratelimiter: sleeping request for %f sec", self.sleep_sec) + rate_limit_sleep_counter.labels(self.host).inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) From 326731889fce08f75727de04ec2c2b9a5203ee06 Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 15 Aug 2022 21:51:25 -0500 Subject: [PATCH 2/4] Add changelog --- changelog.d/13534.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/13534.misc diff --git a/changelog.d/13534.misc b/changelog.d/13534.misc new file mode 100644 index 000000000000..b488bf74c389 --- /dev/null +++ b/changelog.d/13534.misc @@ -0,0 +1 @@ +Add metrics to track how the rate limiter is affecting requests (sleep/reject). From 5679bb2b5ae2563ee4ba2edfaa2db1f4a421d0bd Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Mon, 15 Aug 2022 22:00:47 -0500 Subject: [PATCH 3/4] Fix lints --- synapse/util/ratelimitutils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index fa6f63c77846..8c364304b329 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -18,10 +18,10 @@ import typing from typing import Any, DefaultDict, Iterator, List, Set -from twisted.internet import defer - from prometheus_client.core import Counter +from twisted.internet import defer + from synapse.api.errors import LimitExceededError from synapse.config.ratelimiting import FederationRatelimitSettings from synapse.logging.context import ( From 149ac1db8a32f18f9c4ff0c9c7118940328ad34a Mon Sep 17 00:00:00 2001 From: Eric Eastwood Date: Tue, 16 Aug 2022 11:58:08 -0500 Subject: [PATCH 4/4] Remove unbounded host from labels See https://github.com/matrix-org/synapse/pull/13534#discussion_r946496912 --- synapse/util/ratelimitutils.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index 8c364304b329..b668f9d22541 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -37,9 +37,9 @@ logger = logging.getLogger(__name__) -rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "", ["host"]) - -rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "", ["host"]) +# Track how much the ratelimiter is affecting requests +rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "") +rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "") class FederationRateLimiter: @@ -128,7 +128,8 @@ def _on_enter(self, request_id: object) -> "defer.Deferred[None]": # sleeping or in the ready queue). queue_size = len(self.ready_request_queue) + len(self.sleeping_requests) if queue_size > self.reject_limit: - rate_limit_reject_counter.labels(self.host).inc() + logger.debug("Ratelimiter(%s): rejecting request", self.host) + rate_limit_reject_counter.inc() raise LimitExceededError( retry_after_ms=int(self.window_size / self.sleep_limit) ) @@ -140,7 +141,8 @@ def queue_request() -> "defer.Deferred[None]": queue_defer: defer.Deferred[None] = defer.Deferred() self.ready_request_queue[request_id] = queue_defer logger.info( - "Ratelimiter: queueing request (queue now %i items)", + "Ratelimiter(%s): queueing request (queue now %i items)", + self.host, len(self.ready_request_queue), ) @@ -149,20 +151,28 @@ def queue_request() -> "defer.Deferred[None]": return defer.succeed(None) logger.debug( - "Ratelimit [%s]: len(self.request_times)=%d", + "Ratelimit(%s) [%s]: len(self.request_times)=%d", + self.host, id(request_id), len(self.request_times), ) if len(self.request_times) > self.sleep_limit: - logger.debug("Ratelimiter: sleeping request for %f sec", self.sleep_sec) - rate_limit_sleep_counter.labels(self.host).inc() + logger.debug( + "Ratelimiter(%s) [%s]: sleeping request for %f sec", + self.host, + id(request_id), + self.sleep_sec, + ) + rate_limit_sleep_counter.inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id) def on_wait_finished(_: Any) -> "defer.Deferred[None]": - logger.debug("Ratelimit [%s]: Finished sleeping", id(request_id)) + logger.debug( + "Ratelimit(%s) [%s]: Finished sleeping", self.host, id(request_id) + ) self.sleeping_requests.discard(request_id) queue_defer = queue_request() return queue_defer @@ -172,7 +182,9 @@ def on_wait_finished(_: Any) -> "defer.Deferred[None]": ret_defer = queue_request() def on_start(r: object) -> object: - logger.debug("Ratelimit [%s]: Processing req", id(request_id)) + logger.debug( + "Ratelimit(%s) [%s]: Processing req", self.host, id(request_id) + ) self.current_processing.add(request_id) return r @@ -194,7 +206,7 @@ def on_both(r: object) -> object: return make_deferred_yieldable(ret_defer) def _on_exit(self, request_id: object) -> None: - logger.debug("Ratelimit [%s]: Processed req", id(request_id)) + logger.debug("Ratelimit(%s) [%s]: Processed req", self.host, id(request_id)) self.current_processing.discard(request_id) try: # start processing the next item on the queue.