From ecdbe1efb1f5f9d1c59b660f19627dd1d6a075c9 Mon Sep 17 00:00:00 2001 From: Dani Louca <59848726+dloucasfx@users.noreply.github.com> Date: Mon, 11 Dec 2023 13:55:14 -0500 Subject: [PATCH] [exporter/signalfx] enabled http2 healthcheck (#29716) **Description:** This PR enables the HTTP/2 health check to work around the issue described here: https://github.com/open-telemetry/opentelemetry-collector/pull/9022 As to why I chose 10 seconds for `HTTP2ReadIdleTimeout` and ~~5 seconds~~ 10 seconds (see review comment) for `HTTP2PingTimeout`: those values have been tested in production and, in an active env (with the default http timeout of 10 seconds and default retry settings), they will result in at most a single export failure before the health check detects the corrupted tcp connection and closes it. The only drawback is that if the connection has not been used for over 10 seconds, we might end up sending unnecessary ping frames, which should not be an issue; if it becomes an issue, we can tune those settings. The SFX exporter has multiple http clients: - Metric client, Trace client and Event client. Those clients will have the http2 health check enabled by default as they share the same default config - Correlation client and Dimension client will NOT have the http2 health check enabled. We can revisit this if needed. 
**Testing:** - Run OTEL with one of the exporters that uses an HTTP/2 client, for example the `signalfx` exporter - For simplicity, use a single pipeline/exporter - In a different shell, run this to watch the tcp state of the established connection ``` while (true); do echo date; sudo netstat -anp | grep -E '<endpoint_ip>' | sort -k 5; sleep 2; done ``` - From the netstat output, take note of the source port and the source IP address - Replace <source_ip> and <source_port> with the values from the previous step and run `sudo iptables -A OUTPUT -s <source_ip> -p tcp --sport <source_port> -j DROP` - Note how the OTEL exporter's exports start timing out Expected Result: - A new connection should be established, similarly to http/1, and exports should succeed Actual Result: - The exports keep failing for ~ 15 minutes or for whatever the OS `tcp_retries2` is configured to - After 15 minutes, a new tcp connection is created and exports start working **Documentation:** Readme is updated **Disclaimer:** Not all HTTP/2 servers support H2 ping; however, this should not be a concern as our ingest servers do support H2 ping. But if you are routing, you can check whether H2 ping is supported using this script: https://github.com/golang/go/issues/60818#issuecomment-1602803365 Signed-off-by: Dani Louca --- .chloggen/sfx-exporter-http2.yaml | 27 ++++++++++++++++++++++++ exporter/signalfxexporter/README.md | 4 ++++ exporter/signalfxexporter/config_test.go | 20 +++++++++++------- exporter/signalfxexporter/factory.go | 16 ++++++++------ 4 files changed, 53 insertions(+), 14 deletions(-) create mode 100755 .chloggen/sfx-exporter-http2.yaml diff --git a/.chloggen/sfx-exporter-http2.yaml b/.chloggen/sfx-exporter-http2.yaml new file mode 100755 index 000000000000..d8424804c8fb --- /dev/null +++ b/.chloggen/sfx-exporter-http2.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. 
filelogreceiver) +component: signalfxexporter + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Enable HTTP/2 health check by default + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [29716] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/exporter/signalfxexporter/README.md b/exporter/signalfxexporter/README.md index df5743120a24..468f9741aef4 100644 --- a/exporter/signalfxexporter/README.md +++ b/exporter/signalfxexporter/README.md @@ -86,6 +86,10 @@ The following configuration options can also be configured: defined in `translation/constants.go`. - `timeout` (default = 10s): Amount of time to wait for a send operation to complete. +- `http2_read_idle_timeout` (default = 10s): Send a ping frame for a health check if the connection has been idle for the configured value. + 0s means http/2 health check will be disabled. +- `http2_ping_timeout` (default = 10s): Triggered by `http2_read_idle_timeout`; When there's no response to the ping within the configured value, + the connection will be closed. If this value is set to 0, it will default to 15s. - `headers` (no default): Headers to pass in the payload. 
- `max_idle_conns` (default = 100): The maximum idle HTTP connections the client can keep open. - `max_idle_conns_per_host` (default = 100): The maximum idle HTTP connections the client can keep open per host. diff --git a/exporter/signalfxexporter/config_test.go b/exporter/signalfxexporter/config_test.go index 8b00d47181a5..d50ae80853f7 100644 --- a/exporter/signalfxexporter/config_test.go +++ b/exporter/signalfxexporter/config_test.go @@ -48,11 +48,13 @@ func TestLoadConfig(t *testing.T) { AccessToken: "testToken", Realm: "ap0", HTTPClientSettings: confighttp.HTTPClientSettings{ - Timeout: 10 * time.Second, - Headers: nil, - MaxIdleConns: &hundred, - MaxIdleConnsPerHost: &hundred, - IdleConnTimeout: &idleConnTimeout, + Timeout: 10 * time.Second, + Headers: nil, + MaxIdleConns: &hundred, + MaxIdleConnsPerHost: &hundred, + IdleConnTimeout: &idleConnTimeout, + HTTP2ReadIdleTimeout: 10 * time.Second, + HTTP2PingTimeout: 10 * time.Second, }, RetrySettings: exporterhelper.RetrySettings{ Enabled: true, @@ -114,9 +116,11 @@ func TestLoadConfig(t *testing.T) { "added-entry": "added value", "dot.test": "test", }, - MaxIdleConns: &seventy, - MaxIdleConnsPerHost: &seventy, - IdleConnTimeout: &idleConnTimeout, + MaxIdleConns: &seventy, + MaxIdleConnsPerHost: &seventy, + IdleConnTimeout: &idleConnTimeout, + HTTP2ReadIdleTimeout: 10 * time.Second, + HTTP2PingTimeout: 10 * time.Second, }, RetrySettings: exporterhelper.RetrySettings{ Enabled: true, diff --git a/exporter/signalfxexporter/factory.go b/exporter/signalfxexporter/factory.go index 74dc75249a43..2ffb1d843f95 100644 --- a/exporter/signalfxexporter/factory.go +++ b/exporter/signalfxexporter/factory.go @@ -22,8 +22,10 @@ import ( ) const ( - defaultHTTPTimeout = time.Second * 10 - defaultMaxConns = 100 + defaultHTTPTimeout = time.Second * 10 + defaultHTTP2ReadIdleTimeout = time.Second * 10 + defaultHTTP2PingTimeout = time.Second * 10 + defaultMaxConns = 100 defaultDimMaxBuffered = 10000 defaultDimSendDelay = 10 * 
time.Second @@ -52,10 +54,12 @@ func createDefaultConfig() component.Config { RetrySettings: exporterhelper.NewDefaultRetrySettings(), QueueSettings: exporterhelper.NewDefaultQueueSettings(), HTTPClientSettings: confighttp.HTTPClientSettings{ - Timeout: defaultHTTPTimeout, - MaxIdleConns: &maxConnCount, - MaxIdleConnsPerHost: &maxConnCount, - IdleConnTimeout: &idleConnTimeout, + Timeout: defaultHTTPTimeout, + MaxIdleConns: &maxConnCount, + MaxIdleConnsPerHost: &maxConnCount, + IdleConnTimeout: &idleConnTimeout, + HTTP2ReadIdleTimeout: defaultHTTP2ReadIdleTimeout, + HTTP2PingTimeout: defaultHTTP2PingTimeout, }, AccessTokenPassthroughConfig: splunk.AccessTokenPassthroughConfig{ AccessTokenPassthrough: true,