Skip to content

Commit

Permalink
feat: improve error string for too large events (#258)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?

When libhoney encounters an event that is large enough that the
Honeycomb API will reject it based on [published
limits](https://docs.honeycomb.io/api/tag/Events#operation/createEvent),
it preemptively refuses to send the event, knowing it will not be
accepted. This error is logged so that the application author will
realize that some telemetry is getting dropped.

`event exceeds max event size of 100000 bytes, API will not accept this
event`

However, it is exceedingly difficult to track down what telemetry it is
that is getting dropped. There are no clues about where in the code this
event might be generated, which field is too large, or anything else.


## Short description of the changes

This PR examines the too-large event for the industry-standard fields
`name` and `service.name` (standardized by Open Telemetry but also used
by the beelines and other instrumentation). If those fields exist it
will add their values to the error message. In the cases where the
Honeycomb Beeline package is being used, this will indicate which span
in a large trace is the offending span, dramatically shortening the
process to find what fields might be added that are too large.

There's a delicate balance here of wanting to add enough information to
help the instrumentation author while not adding too much data from the
(already too large) event. It wouldn't do if the notification that an
event was too large was, itself, too large! Because of this danger this
PR does not add additional information such a list of all fields to the
error message. Name and service name will at least point the application
author to the right span, then visual code analysis and other
experiments can help narrow down what might be exceeding the event's
size.

---------

Co-authored-by: Mike Goldsmith <goldsmith.mike@gmail.com>
  • Loading branch information
maplebed and MikeGoldsmith authored Nov 22, 2024
1 parent 1b52a22 commit 0171d6b
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 36 deletions.
21 changes: 19 additions & 2 deletions transmission/transmission.go
Original file line number Diff line number Diff line change
Expand Up @@ -586,8 +586,11 @@ func (b *batchAgg) encodeBatchJSON(events []*Event) ([]byte, int) {
}
// if the event is too large to ever send, add an error to the queue
if len(evByt) > apiEventSizeMax {
// if this event happens to have a `name` or `service.name` field, include them for easier debugging
extraInfo := getExceedsMaxEventSizeExtraInfo(ev)
// log the error and enqueue a response
b.enqueueResponse(Response{
Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event", apiEventSizeMax),
Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event.%s", apiEventSizeMax, extraInfo),
Metadata: ev.Metadata,
})
events[i] = nil
Expand Down Expand Up @@ -641,8 +644,11 @@ func (b *batchAgg) encodeBatchMsgp(events []*Event) ([]byte, int) {
}
// if the event is too large to ever send, add an error to the queue
if len(evByt) > apiEventSizeMax {
// if this event happens to have a `name` or `service.name` field, include them for easier debugging
extraInfo := getExceedsMaxEventSizeExtraInfo(ev)
// log the error and enqueue a response
b.enqueueResponse(Response{
Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event", apiEventSizeMax),
Err: fmt.Errorf("event exceeds max event size of %d bytes, API will not accept this event.%s", apiEventSizeMax, extraInfo),
Metadata: ev.Metadata,
})
events[i] = nil
Expand Down Expand Up @@ -742,3 +748,14 @@ type nower interface {
func buildRequestURL(apiHost, dataset string) (string, error) {
return url.JoinPath(apiHost, "/1/batch", url.PathEscape(dataset))
}

func getExceedsMaxEventSizeExtraInfo(ev *Event) string {
var extraInfo string
if evName, ok := ev.Data["name"]; ok {
extraInfo = fmt.Sprintf(" Name: \"%v\"", evName)
}
if evServiceName, ok := ev.Data["service.name"]; ok {
extraInfo = fmt.Sprintf("%s Service Name: \"%v\"", extraInfo, evServiceName)
}
return extraInfo
}
100 changes: 66 additions & 34 deletions transmission/transmission_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -771,45 +771,77 @@ func TestFireBatchLargeEventsSent(t *testing.T) {
})
}

// Ensure we handle events greater than the limit by enqueuing a response
// Ensure we handle events greater than the limit by enqueuing a response and
// confirm their error messages include name and service name if they exist
func TestFireBatchWithTooLargeEvent(t *testing.T) {
var doMsgpack bool
withJSONAndMsgpack(t, &doMsgpack, func(t *testing.T) {
trt := &testRoundTripper{}
b := &batchAgg{
httpClient: &http.Client{Transport: trt},
testNower: &fakeNower{},
testBlocker: &sync.WaitGroup{},
responses: make(chan Response, 1),
metrics: &nullMetrics{},
enableMsgpackEncoding: doMsgpack,
}
tsts := []struct {
desc string
fhData map[string]interface{}
expectedErr string
}{
{
desc: "large event",
fhData: map[string]interface{}{"reallyREALLYBigColumn": randomString(1024 * 1024)},
expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event.",
}, {
desc: "large event with a name",
fhData: map[string]interface{}{"name": "namae", "reallyREALLYBigColumn": randomString(1024 * 1024)},
expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event. Name: \"namae\"",
}, {
desc: "large event with a service name",
fhData: map[string]interface{}{"service.name": "servicio", "reallyREALLYBigColumn": randomString(1024 * 1024)},
expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event. Service Name: \"servicio\"",
}, {
desc: "large event with a name and service name",
fhData: map[string]interface{}{"name": "nom", "service.name": "servicio", "reallyREALLYBigColumn": randomString(1024 * 1024)},
expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event. Name: \"nom\" Service Name: \"servicio\"",
}, {
desc: "large event with other fields",
fhData: map[string]interface{}{"f1": "racing", "team": "lotus", "reallyREALLYBigColumn": randomString(1024 * 1024)},
expectedErr: "event exceeds max event size of 100000 bytes, API will not accept this event.",
},
}

events := make([]*Event, 1)
for i := range events {
fhData := map[string]interface{}{"reallyREALLYBigColumn": randomString(1024 * 1024)}
events[i] = &Event{
Data: fhData,
SampleRate: 4,
APIHost: "http://fakeHost:8080",
APIKey: "written",
Dataset: "ds1",
Metadata: fmt.Sprintf("meta %d", i),
for _, tt := range tsts {
var doMsgpack bool
withJSONAndMsgpack(t, &doMsgpack, func(t *testing.T) {
trt := &testRoundTripper{}
b := &batchAgg{
httpClient: &http.Client{Transport: trt},
testNower: &fakeNower{},
testBlocker: &sync.WaitGroup{},
responses: make(chan Response, 1),
metrics: &nullMetrics{},
enableMsgpackEncoding: doMsgpack,
}
b.Add(events[i])
}

key := "http://fakeHost:8080_written_ds1"
events := make([]*Event, 1)
for i := range events {
fhData := tt.fhData
events[i] = &Event{
Data: fhData,
SampleRate: 4,
APIHost: "http://fakeHost:8080",
APIKey: "written",
Dataset: "ds1",
Metadata: fmt.Sprintf("meta %d", i),
}
b.Add(events[i])
}

b.Fire(&testNotifier{})
b.testBlocker.Wait()
resp := testGetResponse(t, b.responses)
testEquals(t, resp.Err.Error(), "event exceeds max event size of 100000 bytes, API will not accept this event")
key := "http://fakeHost:8080_written_ds1"

testEquals(t, len(b.overflowBatches), 0)
testEquals(t, len(b.overflowBatches[key]), 0)
testEquals(t, trt.callCount, 0)
})
b.Fire(&testNotifier{})
b.testBlocker.Wait()
resp := testGetResponse(t, b.responses)
testEquals(t, resp.Err.Error(), tt.expectedErr, tt.desc)

testEquals(t, len(b.overflowBatches), 0, tt.desc)
testEquals(t, len(b.overflowBatches[key]), 0, tt.desc)
testEquals(t, trt.callCount, 0, tt.desc)

})
}
}

// Ensure we can deal with batches whose first event won't json encode
Expand Down Expand Up @@ -849,7 +881,7 @@ func TestFireBatchWithBrokenFirstEvent(t *testing.T) {
b.testBlocker.Wait()
resp := testGetResponse(t, b.responses)
testEquals(t, resp.Metadata, "meta 0")
testEquals(t, resp.Err.Error(), "event exceeds max event size of 100000 bytes, API will not accept this event")
testEquals(t, resp.Err.Error(), "event exceeds max event size of 100000 bytes, API will not accept this event.")
resp = testGetResponse(t, b.responses)
testEquals(t, resp.Metadata, "meta 1")
testEquals(t, resp.StatusCode, 202)
Expand Down

0 comments on commit 0171d6b

Please sign in to comment.