From 5b7d41ad3e93020f04d1a27d109ba653752b4128 Mon Sep 17 00:00:00 2001 From: Carson Ip Date: Wed, 2 Oct 2024 13:45:05 +0100 Subject: [PATCH 1/5] Add hint in error logs for TSDB version_conflict_engine_exception error --- ...lasticsearchexporter_index-error-hint.yaml | 27 +++++++++++++++++++ exporter/elasticsearchexporter/README.md | 8 +++++- exporter/elasticsearchexporter/bulkindexer.go | 17 +++++++++--- .../elasticsearchexporter/bulkindexer_test.go | 25 ++++++++++++++++- 4 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 .chloggen/elasticsearchexporter_index-error-hint.yaml diff --git a/.chloggen/elasticsearchexporter_index-error-hint.yaml b/.chloggen/elasticsearchexporter_index-error-hint.yaml new file mode 100644 index 000000000000..fad70f77154f --- /dev/null +++ b/.chloggen/elasticsearchexporter_index-error-hint.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: elasticsearchexporter + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Add hint in error logs for TSDB version_conflict_engine_exception error + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [35546] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [user] diff --git a/exporter/elasticsearchexporter/README.md b/exporter/elasticsearchexporter/README.md index 31d2cae89c6f..2bfa2fbc4333 100644 --- a/exporter/elasticsearchexporter/README.md +++ b/exporter/elasticsearchexporter/README.md @@ -337,4 +337,10 @@ Otherwise, it is mapped to an empty string (""). #### `@timestamp` -In case the record contains `timestamp`, this value is used. Otherwise, the `observed timestamp` is used. \ No newline at end of file +In case the record contains `timestamp`, this value is used. Otherwise, the `observed timestamp` is used. + +## Known issues + +### version_conflict_engine_exception + +When sending high traffic of metrics to a TSDB metrics data stream, e.g. using OTel mapping mode to a 8.16 Elasticsearch, it is possible to get error logs "failed to index document" with `error.type` "version_conflict_engine_exception" and `error.reason` containing "version conflict, document already exists". It is due to Elasticsearch grouping metrics using `@timestamp` in milliseconds precision as opposed to nanoseconds in elasticsearchexporter. This will be fixed in a future version of Elasticsearch. diff --git a/exporter/elasticsearchexporter/bulkindexer.go b/exporter/elasticsearchexporter/bulkindexer.go index 93e909cc3a3b..6c5c85ff4b9a 100644 --- a/exporter/elasticsearchexporter/bulkindexer.go +++ b/exporter/elasticsearchexporter/bulkindexer.go @@ -8,6 +8,7 @@ import ( "errors" "io" "runtime" + "strings" "sync" "sync/atomic" "time" @@ -329,12 +330,22 @@ func flushBulkIndexer( logger.Error("bulk indexer flush error", zap.Error(err)) } for _, resp := range stat.FailedDocs { - logger.Error( - "failed to index document", + fields := []zap.Field{ zap.String("index", resp.Index), zap.String("error.type", resp.Error.Type), zap.String("error.reason", resp.Error.Reason), - ) + } + if hint := getErrorHint(resp.Index, resp.Error.Type); hint != "" { + fields = append(fields, zap.String("hint", hint)) + } + logger.Error("failed to index document", fields...) } return stat, err } + +func getErrorHint(index, errorType string) string { + if strings.HasPrefix(index, ".ds-metrics-") && errorType == "version_conflict_engine_exception" { + return "check the known issues section of elasticsearchexporter" + } + return "" +} diff --git a/exporter/elasticsearchexporter/bulkindexer_test.go b/exporter/elasticsearchexporter/bulkindexer_test.go index a70cdecc25dd..4ad79dd545c5 100644 --- a/exporter/elasticsearchexporter/bulkindexer_test.go +++ b/exporter/elasticsearchexporter/bulkindexer_test.go @@ -6,6 +6,7 @@ package elasticsearchexporter import ( "context" "errors" + "fmt" "io" "net/http" "strings" @@ -119,6 +120,8 @@ func TestAsyncBulkIndexer_flush_error(t *testing.T) { tests := []struct { name string roundTripFunc func(*http.Request) (*http.Response, error) + wantMessage string + wantFields []zap.Field }{ { name: "500", @@ -129,6 +132,7 @@ func TestAsyncBulkIndexer_flush_error(t *testing.T) { Body: io.NopCloser(strings.NewReader("error")), }, nil }, + wantMessage: "bulk indexer flush error", }, { name: "429", @@ -139,12 +143,27 @@ func TestAsyncBulkIndexer_flush_error(t *testing.T) { Body: io.NopCloser(strings.NewReader("error")), }, nil }, + wantMessage: "bulk indexer flush error", }, { name: "transport error", roundTripFunc: func(*http.Request) (*http.Response, error) { return nil, errors.New("transport error") }, + wantMessage: "bulk indexer flush error", + }, + { + name: "known version conflict error", + roundTripFunc: func(*http.Request) (*http.Response, error) { + return &http.Response{ + StatusCode: 200, + Header: http.Header{"X-Elastic-Product": []string{"Elasticsearch"}}, + Body: io.NopCloser(strings.NewReader( + `{"items":[{"create":{"_index":".ds-metrics-generic.otel-default","status":400,"error":{"type":"version_conflict_engine_exception","reason":""}}}]}`)), + }, nil + }, + wantMessage: "failed to index document", + wantFields: []zap.Field{zap.String("hint", "check the known issues section of elasticsearchexporter")}, }, } @@ -169,7 +188,11 @@ func TestAsyncBulkIndexer_flush_error(t *testing.T) { time.Sleep(100 * time.Millisecond) assert.Equal(t, int64(0), bulkIndexer.stats.docsIndexed.Load()) assert.NoError(t, bulkIndexer.Close(context.Background())) - assert.Equal(t, 1, observed.FilterMessage("bulk indexer flush error").Len()) + messages := observed.FilterMessage(tt.wantMessage) + require.Equal(t, 1, messages.Len(), fmt.Sprintf("message not found; observed.All()=%v", observed.All())) + for _, wantField := range tt.wantFields { + assert.Equal(t, 1, messages.FilterField(wantField).Len(), fmt.Sprintf("message with field not found; observed.All()=%v", observed.All())) + } }) } } From d42149b3647696f99a6b71d292fd71696cd77a6f Mon Sep 17 00:00:00 2001 From: Carson Ip Date: Wed, 2 Oct 2024 13:56:52 +0100 Subject: [PATCH 2/5] make lint --- exporter/elasticsearchexporter/bulkindexer_test.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/exporter/elasticsearchexporter/bulkindexer_test.go b/exporter/elasticsearchexporter/bulkindexer_test.go index 4ad79dd545c5..43d670090256 100644 --- a/exporter/elasticsearchexporter/bulkindexer_test.go +++ b/exporter/elasticsearchexporter/bulkindexer_test.go @@ -6,7 +6,6 @@ package elasticsearchexporter import ( "context" "errors" - "fmt" "io" "net/http" "strings" @@ -189,9 +188,9 @@ func TestAsyncBulkIndexer_flush_error(t *testing.T) { assert.Equal(t, int64(0), bulkIndexer.stats.docsIndexed.Load()) assert.NoError(t, bulkIndexer.Close(context.Background())) messages := observed.FilterMessage(tt.wantMessage) - require.Equal(t, 1, messages.Len(), fmt.Sprintf("message not found; observed.All()=%v", observed.All())) + require.Equal(t, 1, messages.Len(), "message not found; observed.All()=%v", observed.All()) for _, wantField := range tt.wantFields { - assert.Equal(t, 1, messages.FilterField(wantField).Len(), fmt.Sprintf("message with field not found; observed.All()=%v", observed.All())) + assert.Equal(t, 1, messages.FilterField(wantField).Len(), "message with field not found; observed.All()=%v", observed.All()) } }) } From 06c7a97f4175de5c5947f41ab00588747f564945 Mon Sep 17 00:00:00 2001 From: Carson Ip Date: Wed, 2 Oct 2024 14:17:14 +0100 Subject: [PATCH 3/5] Call out misconfiguration --- exporter/elasticsearchexporter/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/exporter/elasticsearchexporter/README.md b/exporter/elasticsearchexporter/README.md index 2bfa2fbc4333..ca52ada03a33 100644 --- a/exporter/elasticsearchexporter/README.md +++ b/exporter/elasticsearchexporter/README.md @@ -344,3 +344,5 @@ In case the record contains `timestamp`, this value is used. Otherwise, the `obs ### version_conflict_engine_exception When sending high traffic of metrics to a TSDB metrics data stream, e.g. using OTel mapping mode to a 8.16 Elasticsearch, it is possible to get error logs "failed to index document" with `error.type` "version_conflict_engine_exception" and `error.reason` containing "version conflict, document already exists". It is due to Elasticsearch grouping metrics using `@timestamp` in milliseconds precision as opposed to nanoseconds in elasticsearchexporter. This will be fixed in a future version of Elasticsearch. + +However, if `@timestamp` precision is not the problem, it is possible that the error indicates an actual violation of the [single writer principle](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#single-writer). Check your metrics pipeline setup for misconfiguration. From 6c2f5b87e9ccfcc19e9d01f7229a08a44b581f7b Mon Sep 17 00:00:00 2001 From: Carson Ip Date: Wed, 2 Oct 2024 16:56:03 +0100 Subject: [PATCH 4/5] Mention workaround --- exporter/elasticsearchexporter/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/exporter/elasticsearchexporter/README.md b/exporter/elasticsearchexporter/README.md index ca52ada03a33..1f7c9f5fb9c5 100644 --- a/exporter/elasticsearchexporter/README.md +++ b/exporter/elasticsearchexporter/README.md @@ -343,6 +343,8 @@ In case the record contains `timestamp`, this value is used. Otherwise, the `obs ### version_conflict_engine_exception -When sending high traffic of metrics to a TSDB metrics data stream, e.g. using OTel mapping mode to a 8.16 Elasticsearch, it is possible to get error logs "failed to index document" with `error.type` "version_conflict_engine_exception" and `error.reason` containing "version conflict, document already exists". It is due to Elasticsearch grouping metrics using `@timestamp` in milliseconds precision as opposed to nanoseconds in elasticsearchexporter. This will be fixed in a future version of Elasticsearch. +When sending high traffic of metrics to a TSDB metrics data stream, e.g. using OTel mapping mode to a 8.16 Elasticsearch, it is possible to get error logs "failed to index document" with `error.type` "version_conflict_engine_exception" and `error.reason` containing "version conflict, document already exists". It is due to Elasticsearch grouping metrics with the same dimensions, whether it is the same or different metric name, using `@timestamp` in milliseconds precision as opposed to nanoseconds in elasticsearchexporter. -However, if `@timestamp` precision is not the problem, it is possible that the error indicates an actual violation of the [single writer principle](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#single-writer). Check your metrics pipeline setup for misconfiguration. +This will be fixed in a future version of Elasticsearch. A possible workaround would be to use a transform processor to truncate the timestamp, but this will cause duplicate data to be dropped silently. + +However, if `@timestamp` precision is not the problem, check your metrics pipeline setup for misconfiguration that causes an actual violation of the [single writer principle](https://opentelemetry.io/docs/specs/otel/metrics/data-model/#single-writer). From f720a66291dbdcd6cc18a48065b3f65292719c77 Mon Sep 17 00:00:00 2001 From: Carson Ip Date: Thu, 3 Oct 2024 09:40:16 +0100 Subject: [PATCH 5/5] Update hint message --- exporter/elasticsearchexporter/bulkindexer.go | 2 +- exporter/elasticsearchexporter/bulkindexer_test.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/exporter/elasticsearchexporter/bulkindexer.go b/exporter/elasticsearchexporter/bulkindexer.go index 6c5c85ff4b9a..471ddc2dc7b9 100644 --- a/exporter/elasticsearchexporter/bulkindexer.go +++ b/exporter/elasticsearchexporter/bulkindexer.go @@ -345,7 +345,7 @@ func flushBulkIndexer( func getErrorHint(index, errorType string) string { if strings.HasPrefix(index, ".ds-metrics-") && errorType == "version_conflict_engine_exception" { - return "check the known issues section of elasticsearchexporter" + return "check the \"Known issues\" section of Elasticsearch Exporter docs" } return "" } diff --git a/exporter/elasticsearchexporter/bulkindexer_test.go b/exporter/elasticsearchexporter/bulkindexer_test.go index 43d670090256..b417942734d6 100644 --- a/exporter/elasticsearchexporter/bulkindexer_test.go +++ b/exporter/elasticsearchexporter/bulkindexer_test.go @@ -162,7 +162,7 @@ func TestAsyncBulkIndexer_flush_error(t *testing.T) { }, nil }, wantMessage: "failed to index document", - wantFields: []zap.Field{zap.String("hint", "check the known issues section of elasticsearchexporter")}, + wantFields: []zap.Field{zap.String("hint", "check the \"Known issues\" section of Elasticsearch Exporter docs")}, }, }