From 0171d6bd24ec3a3895d02d187df20308c315dfeb Mon Sep 17 00:00:00 2001 From: Ben Hartshorne Date: Fri, 22 Nov 2024 06:44:19 -0800 Subject: [PATCH] feat: improve error string for too large events (#258) ## Which problem is this PR solving? When libhoney encounters an event that is large enough that the Honeycomb API will reject it based on [published limits](https://docs.honeycomb.io/api/tag/Events#operation/createEvent), it preemptively refuses to send the event, knowing it will not be accepted. This error is logged so that the application author will realize that some telemetry is getting dropped. `event exceeds max event size of 100000 bytes, API will not accept this event` However, it is exceedingly difficult to track down what telemetry it is that is getting dropped. There are no clues about where in the code this event might be generated, which field is too large, or anything else. ## Short description of the changes This PR examines the too-large event for the industry-standard fields `name` and `service.name` (standardized by Open Telemetry but also used by the beelines and other instrumentation). If those fields exist it will add their values to the error message. In the cases where the Honeycomb Beeline package is being used, this will indicate which span in a large trace is the offending span, dramatically shortening the process to find what fields might be added that are too large. There's a delicate balance here of wanting to add enough information to help the instrumentation author while not adding too much data from the (already too large) event. It wouldn't do if the notification that an event was too large was, itself, too large! Because of this danger this PR does not add additional information such a list of all fields to the error message. Name and service name will at least point the application author to the right span, then visual code analysis and other experiments can help narrow down what might be exceeding the event's size. --------- Co-authored-by: Mike Goldsmith --- transmission/transmission.go | 21 ++++++- transmission/transmission_test.go | 100 ++++++++++++++++++++---------- 2 files changed, 85 insertions(+), 36 deletions(-) diff --git a/transmission/transmission.go b/transmission/transmission.go index 243b193..1ee6d36 100644 --- a/transmission/transmission.go +++ b/transmission/transmission.go @@ -586,8 +586,11 @@ func (b *batchAgg) encodeBatchJSON(events []*Event) ([]byte, int) { } // if the event is too large to ever send, add an error to the queue if len(evByt) > apiEventSizeMax { + // if this event happens to have a `name` or `service.name` field, include them for easier debugging + extraInfo := getExceedsMaxEventSizeExtraInfo(ev) + // log the error and enqueue a response b.enqueueResponse(Response{ - Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event", apiEventSizeMax), + Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event.%s", apiEventSizeMax, extraInfo), Metadata: ev.Metadata, }) events[i] = nil @@ -641,8 +644,11 @@ func (b *batchAgg) encodeBatchMsgp(events []*Event) ([]byte, int) { } // if the event is too large to ever send, add an error to the queue if len(evByt) > apiEventSizeMax { + // if this event happens to have a `name` or `service.name` field, include them for easier debugging + extraInfo := getExceedsMaxEventSizeExtraInfo(ev) + // log the error and enqueue a response b.enqueueResponse(Response{ - Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event", apiEventSizeMax), + Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event.%s", apiEventSizeMax, extraInfo), Metadata: ev.Metadata, }) events[i] = nil @@ -742,3 +748,14 @@ type nower interface { func buildRequestURL(apiHost, dataset string) (string, error) { return url.JoinPath(apiHost, "/1/batch", url.PathEscape(dataset)) } + +func getExceedsMaxEventSizeExtraInfo(ev *Event) string { + var extraInfo string + if evName, ok := ev.Data["name"]; ok { + extraInfo = fmt.Sprintf(" Name: \"%v\"", evName) + } + if evServiceName, ok := ev.Data["service.name"]; ok { + extraInfo = fmt.Sprintf("%s Service Name: \"%v\"", extraInfo, evServiceName) + } + return extraInfo +} diff --git a/transmission/transmission_test.go b/transmission/transmission_test.go index 3b025b5..010c254 100644 --- a/transmission/transmission_test.go +++ b/transmission/transmission_test.go @@ -771,45 +771,77 @@ func TestFireBatchLargeEventsSent(t *testing.T) { }) } -// Ensure we handle events greater than the limit by enqueuing a response +// Ensure we handle events greater than the limit by enqueuing a response and +// confirm their error messages include name and service name if they exist func TestFireBatchWithTooLargeEvent(t *testing.T) { - var doMsgpack bool - withJSONAndMsgpack(t, &doMsgpack, func(t *testing.T) { - trt := &testRoundTripper{} - b := &batchAgg{ - httpClient: &http.Client{Transport: trt}, - testNower: &fakeNower{}, - testBlocker: &sync.WaitGroup{}, - responses: make(chan Response, 1), - metrics: &nullMetrics{}, - enableMsgpackEncoding: doMsgpack, - } + tsts := []struct { + desc string + fhData map[string]interface{} + expectedErr string + }{ + { + desc: "large event", + fhData: map[string]interface{}{"reallyREALLYBigColumn": randomString(1024 * 1024)}, + expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event.", + }, { + desc: "large event with a name", + fhData: map[string]interface{}{"name": "namae", "reallyREALLYBigColumn": randomString(1024 * 1024)}, + expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event. Name: \"namae\"", + }, { + desc: "large event with a service name", + fhData: map[string]interface{}{"service.name": "servicio", "reallyREALLYBigColumn": randomString(1024 * 1024)}, + expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event. Service Name: \"servicio\"", + }, { + desc: "large event with a name and service name", + fhData: map[string]interface{}{"name": "nom", "service.name": "servicio", "reallyREALLYBigColumn": randomString(1024 * 1024)}, + expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event. Name: \"nom\" Service Name: \"servicio\"", + }, { + desc: "large event with other fields", + fhData: map[string]interface{}{"f1": "racing", "team": "lotus", "reallyREALLYBigColumn": randomString(1024 * 1024)}, + expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event.", + }, + } - events := make([]*Event, 1) - for i := range events { - fhData := map[string]interface{}{"reallyREALLYBigColumn": randomString(1024 * 1024)} - events[i] = &Event{ - Data: fhData, - SampleRate: 4, - APIHost: "http://fakeHost:8080", - APIKey: "written", - Dataset: "ds1", - Metadata: fmt.Sprintf("meta %d", i), + for _, tt := range tsts { + var doMsgpack bool + withJSONAndMsgpack(t, &doMsgpack, func(t *testing.T) { + trt := &testRoundTripper{} + b := &batchAgg{ + httpClient: &http.Client{Transport: trt}, + testNower: &fakeNower{}, + testBlocker: &sync.WaitGroup{}, + responses: make(chan Response, 1), + metrics: &nullMetrics{}, + enableMsgpackEncoding: doMsgpack, } - b.Add(events[i]) - } - key := "http://fakeHost:8080_written_ds1" + events := make([]*Event, 1) + for i := range events { + fhData := tt.fhData + events[i] = &Event{ + Data: fhData, + SampleRate: 4, + APIHost: "http://fakeHost:8080", + APIKey: "written", + Dataset: "ds1", + Metadata: fmt.Sprintf("meta %d", i), + } + b.Add(events[i]) + } - b.Fire(&testNotifier{}) - b.testBlocker.Wait() - resp := testGetResponse(t, b.responses) - testEquals(t, resp.Err.Error(), "event exceeds max event size of 100000 bytes, API will not accept this event") + key := "http://fakeHost:8080_written_ds1" - testEquals(t, len(b.overflowBatches), 0) - testEquals(t, len(b.overflowBatches[key]), 0) - testEquals(t, trt.callCount, 0) - }) + b.Fire(&testNotifier{}) + b.testBlocker.Wait() + resp := testGetResponse(t, b.responses) + testEquals(t, resp.Err.Error(), tt.expectedErr, tt.desc) + + testEquals(t, len(b.overflowBatches), 0, tt.desc) + testEquals(t, len(b.overflowBatches[key]), 0, tt.desc) + testEquals(t, trt.callCount, 0, tt.desc) + + }) + } } // Ensure we can deal with batches whose first event won't json encode @@ -849,7 +881,7 @@ func TestFireBatchWithBrokenFirstEvent(t *testing.T) { b.testBlocker.Wait() resp := testGetResponse(t, b.responses) testEquals(t, resp.Metadata, "meta 0") - testEquals(t, resp.Err.Error(), "event exceeds max event size of 100000 bytes, API will not accept this event") + testEquals(t, resp.Err.Error(), "event exceeds max event size of 100000 bytes, API will not accept this event.") resp = testGetResponse(t, b.responses) testEquals(t, resp.Metadata, "meta 1") testEquals(t, resp.StatusCode, 202)