Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Add tuning options for federation client backoff #5556

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/5556.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Synapse's federation backoff behaviour can now be tuned using the new `federation_backoff` settings.
65 changes: 65 additions & 0 deletions docs/sample_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,71 @@ federation_ip_range_blacklist:
- 'fe80::/64'
- 'fc00::/7'

# Federation Backoff Tuning
#
# These options control what Synapse will consider an unrecoverable
# network error as a federation client. Unrecoverable network errors
# trigger immediate backoff. Setting these options too aggressively may
# cause your Synapse to consider other servers down due to temporary
# networking hiccups, causing outward federation delays for the duration
# of the backoff period.
#
# These options may be useful on more constrained instances that are in
# rooms with lots of servers.
#
# The options are:
#
# dns_resolution: DNS lookup failure (NXDOMAIN and others) will cause
# immediate backoff. Default: False.
#
# dns_servfail: DNS lookup failures where the server cannot process
# the request (SERVFAIL) will cause immediate backoff. This can be
# triggered either by temporary outages on your DNS server, or
# failure of the DNS server responsible for the target domain.
# Only enable this if you have reliable DNS servers.
# Default: False.
#
# no_route_to_host: EHOSTUNREACH errors will cause immediate backoff.
# This can be caused by DNS resolving to unroutable IP addresses.
# Default: False.
#
# refused_connection: ECONNREFUSED errors will cause immediate backoff.
# This can result in temporary outages (e.g. restarts of other
# servers) triggering backoff and subsequent federation delays.
# Default: False.
#
# cannot_assign_address: Being unable to assign an address when binding
# a socket will cause immediate backoff. Can be caused by IPv4/v6
# misconfiguration.
# Default: False.
#
# invalid_tls: Invalid TLS negotiation or a TLS certificate failure
# will cause immediate backoff. This is generally safe to enable,
# but it will cause Synapse to not check for a new valid TLS
# setup until the end of the backoff, which may cause a delay in
# re-establishing federation with servers that fix their TLS.
# Default: False.
#
# on_timeout: A timeout will cause immediate backoff. Timeouts can be
# caused by various reasons (overloading of the target server,
# network degradation) and may cause Synapse to consider a
# temporarily overloaded server as down, and cause subsequent
# federation delays. Default: False.
#
# timeout_amount: How long Synapse will wait before timing out
# federation client requests. This accepts a human-readable time
# (e.g. "60s") or a plain number of milliseconds. Default: 60s.
#
#federation_backoff:
# dns_resolution: False
# dns_servfail: False
# no_route_to_host: False
# refused_connection: False
# cannot_assign_address: False
# invalid_tls: False
# on_timeout: False
# timeout_amount: "60s"

# List of ports that Synapse should listen on, their purpose and their
# configuration.
#
Expand Down
85 changes: 85 additions & 0 deletions synapse/config/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import logging
import os.path

import attr
from netaddr import IPSet

from synapse.api.room_versions import KNOWN_ROOM_VERSIONS
Expand Down Expand Up @@ -324,6 +325,25 @@ def read_config(self, config, **kwargs):
"cleanup_extremities_with_dummy_events", False
)

@attr.s
class FederationBackoffSettings(object):
    """Tuning options read from the ``federation_backoff`` config section.

    Each boolean attribute corresponds to the option of the same name
    described in the sample config; all of them default to False.
    """

    dns_resolution = attr.ib(default=False, type=bool)
    dns_servfail = attr.ib(default=False, type=bool)
    no_route_to_host = attr.ib(default=False, type=bool)
    refused_connection = attr.ib(default=False, type=bool)
    cannot_assign_address = attr.ib(default=False, type=bool)
    invalid_tls = attr.ib(default=False, type=bool)
    # The raw config value is passed through ``self.parse_duration``.
    # NOTE(review): presumably this yields milliseconds, as the sample
    # config describes -- confirm against parse_duration.
    timeout_amount = attr.ib(
        default="60s", converter=self.parse_duration, type=int
    )
    on_timeout = attr.ib(default=False, type=bool)

# A key that is present but empty in the YAML (e.g. ``federation_backoff:``
# uncommented while every sub-option is still commented out) loads as None,
# which cannot be **-unpacked below. Treat it the same as a missing section.
federation_backoff_settings = config.get("federation_backoff")
if federation_backoff_settings is None:
    federation_backoff_settings = {}

self.federation_backoff_settings = FederationBackoffSettings(
    **federation_backoff_settings
)

def has_tls_listener(self):
    """Report whether any configured listener has TLS turned on.

    Returns:
        bool: True if at least one entry of ``self.listeners`` has a
        truthy ``"tls"`` value, otherwise False.
    """
    for listener in self.listeners:
        if listener["tls"]:
            return True
    return False

Expand Down Expand Up @@ -465,6 +485,71 @@ def generate_config_section(
- 'fe80::/64'
- 'fc00::/7'

# Federation Backoff Tuning
#
# These options control what Synapse will consider an unrecoverable
# network error as a federation client. Unrecoverable network errors
# trigger immediate backoff. Setting these options too aggressively may
# cause your Synapse to consider other servers down due to temporary
# networking hiccups, causing outward federation delays for the duration
# of the backoff period.
#
# These options may be useful on more constrained instances that are in
# rooms with lots of servers.
#
# The options are:
#
# dns_resolution: DNS lookup failure (NXDOMAIN and others) will cause
# immediate backoff. Default: False.
#
# dns_servfail: DNS lookup failures where the server cannot process
# the request (SERVFAIL) will cause immediate backoff. This can be
# triggered either by temporary outages on your DNS server, or
# failure of the DNS server responsible for the target domain.
# Only enable this if you have reliable DNS servers.
# Default: False.
#
# no_route_to_host: EHOSTUNREACH errors will cause immediate backoff.
# This can be caused by DNS resolving to unroutable IP addresses.
# Default: False.
#
# refused_connection: ECONNREFUSED errors will cause immediate backoff.
# This can result in temporary outages (e.g. restarts of other
# servers) triggering backoff and subsequent federation delays.
# Default: False.
#
# cannot_assign_address: Being unable to assign an address when binding
# a socket will cause immediate backoff. Can be caused by IPv4/v6
# misconfiguration.
# Default: False.
#
# invalid_tls: Invalid TLS negotiation or a TLS certificate failure
# will cause immediate backoff. This is generally safe to enable,
# but it will cause Synapse to not check for a new valid TLS
# setup until the end of the backoff, which may cause a delay in
# re-establishing federation with servers that fix their TLS.
# Default: False.
#
# on_timeout: A timeout will cause immediate backoff. Timeouts can be
# caused by various reasons (overloading of the target server,
# network degradation) and may cause Synapse to consider a
# temporarily overloaded server as down, and cause subsequent
# federation delays. Default: False.
#
# timeout_amount: How long Synapse will wait before timing out
# federation client requests. This accepts a human-readable time
# (e.g. "60s") or a plain number of milliseconds. Default: 60s.
#
#federation_backoff:
# dns_resolution: False
# dns_servfail: False
# no_route_to_host: False
# refused_connection: False
# cannot_assign_address: False
# invalid_tls: False
# on_timeout: False
# timeout_amount: "60s"

# List of ports that Synapse should listen on, their purpose and their
# configuration.
#
Expand Down
2 changes: 2 additions & 0 deletions synapse/federation/transport/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class TransportLayerClient(object):
def __init__(self, hs):
    """Store the values this transport client takes from the homeserver.

    Args:
        hs: homeserver object; this constructor reads ``hs.hostname``,
            calls ``hs.get_http_client()`` and reads
            ``hs.config.federation_backoff_settings``.
    """
    # Read everything from ``hs`` first (same order as before), then
    # publish the values on the instance.
    server_name = hs.hostname
    http_client = hs.get_http_client()
    backoff_settings = hs.config.federation_backoff_settings

    self.server_name = server_name
    self.client = http_client
    self.backoff_settings = backoff_settings

@log_function
def get_room_state(self, destination, room_id, event_id):
Expand Down Expand Up @@ -181,6 +182,7 @@ def send_transaction(self, transaction, json_data_callback=None):
long_retries=True,
backoff_on_404=True, # If we get a 404 the other side has gone
try_trailing_slash_on_400=True,
retry_on_dns_fail=not self.backoff_settings.dns_resolution,
)

defer.returnValue(response)
Expand Down
Loading