From 833a806ccd2eb19c86f7cc8b3aada295879caca8 Mon Sep 17 00:00:00 2001 From: Andrew Kroh Date: Mon, 5 Dec 2016 13:06:06 -0500 Subject: [PATCH] Reduce allocations in UTF16 conversion (#3113) When decoding a UTF16 string contained in a buffer larger than just the string, more space was allocated than required. ``` BenchmarkUTF16BytesToString/simple_string-4 2000000 846 ns/op 384 B/op 3 allocs/op BenchmarkUTF16BytesToString/larger_buffer-4 2000000 874 ns/op 384 B/op 3 allocs/op BenchmarkUTF16BytesToString_Original/simple_string-4 2000000 840 ns/op 384 B/op 3 allocs/op BenchmarkUTF16BytesToString_Original/larger_buffer-4 1000000 3055 ns/op 8720 B/op 3 allocs/op ``` ``` PS C:\Gopath\src\github.com\elastic\beats\winlogbeat> go test -v github.com/elastic/beats/winlogbeat/eventlog -run ^TestBenchmarkBatchReadSize$ -benchmem -benchtime 10s -benchtest === RUN TestBenchmarkBatchReadSize --- PASS: TestBenchmarkBatchReadSize (68.04s) bench_test.go:100: batch_size=10, total_events=20000, batch_time=5.682627ms, events_per_sec=1759.7494961397256, bytes_alloced_per_event=44 kB, total_allocs=4923840 bench_test.go:100: batch_size=100, total_events=30000, batch_time=53.850879ms, events_per_sec=1856.9799018508127, bytes_alloced_per_event=44 kB, total_allocs=7354285 bench_test.go:100: batch_size=500, total_events=25000, batch_time=271.118774ms, events_per_sec=1844.2101689350366, bytes_alloced_per_event=43 kB, total_allocs=6125665 bench_test.go:100: batch_size=1000, total_events=30000, batch_time=558.03918ms, events_per_sec=1791.9888707455987, bytes_alloced_per_event=43 kB, total_allocs=7350324 PASS ok github.com/elastic/beats/winlogbeat/eventlog 68.095s PS C:\Gopath\src\github.com\elastic\beats\winlogbeat> go test -v github.com/elastic/beats/winlogbeat/eventlog -run ^TestBenchmarkBatchReadSize$ -benchmem -benchtime 10s -benchtest === RUN TestBenchmarkBatchReadSize --- PASS: TestBenchmarkBatchReadSize (71.85s) bench_test.go:100: batch_size=10, total_events=30000, batch_time=5.713873ms, 
events_per_sec=1750.1264028794478, bytes_alloced_per_event=25 kB, total_allocs=7385820 bench_test.go:100: batch_size=100, total_events=30000, batch_time=52.454484ms, events_per_sec=1906.4147118480853, bytes_alloced_per_event=24 kB, total_allocs=7354318 bench_test.go:100: batch_size=500, total_events=25000, batch_time=260.56659ms, events_per_sec=1918.8952812407758, bytes_alloced_per_event=24 kB, total_allocs=6125688 bench_test.go:100: batch_size=1000, total_events=30000, batch_time=530.468816ms, events_per_sec=1885.124949550286, bytes_alloced_per_event=24 kB, total_allocs=7350360 PASS ok github.com/elastic/beats/winlogbeat/eventlog 71.908s ``` --- winlogbeat/sys/strings.go | 42 ++++++++++++++----- winlogbeat/sys/strings_test.go | 77 ++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 winlogbeat/sys/strings_test.go diff --git a/winlogbeat/sys/strings.go b/winlogbeat/sys/strings.go index 25f27a9d3a55..ecd71dcaaf87 100644 --- a/winlogbeat/sys/strings.go +++ b/winlogbeat/sys/strings.go @@ -6,27 +6,49 @@ import ( "unicode/utf16" ) -// UTF16BytesToString returns the Unicode code point sequence represented -// by the UTF-16 buffer b. +// UTF16BytesToString returns a string that is decoded from the UTF-16 bytes. +// The byte slice must be of even length otherwise an error will be returned. +// The integer returned is the offset to the start of the next string within +// the buffer if it exists, otherwise -1 is returned. func UTF16BytesToString(b []byte) (string, int, error) { if len(b)%2 != 0 { - return "", 0, fmt.Errorf("Slice must have an even length (length=%d)", - len(b)) + return "", 0, fmt.Errorf("Slice must have an even length (length=%d)", len(b)) + } + + offset := -1 + + // Find the null terminator if it exists and re-slice b. 
+ if nullIndex := indexNullTerminator(b); nullIndex > 0 { + if len(b) > nullIndex+2 { + offset = nullIndex + 2 + } + + b = b[:nullIndex] } - offset := len(b)/2 + 2 s := make([]uint16, len(b)/2) for i := range s { s[i] = uint16(b[i*2]) + uint16(b[(i*2)+1])<<8 + } - if s[i] == 0 { - s = s[0:i] - offset = i*2 + 2 - break + return string(utf16.Decode(s)), offset, nil +} + +// indexNullTerminator returns the index of a null terminator within a buffer +// containing UTF-16 encoded data. If the null terminator is not found -1 is +// returned. +func indexNullTerminator(b []byte) int { + if len(b) < 2 { + return -1 + } + + for i := 0; i < len(b); i += 2 { + if b[i] == 0 && b[i+1] == 0 { + return i } } - return string(utf16.Decode(s)), offset, nil + return -1 } // RemoveWindowsLineEndings replaces carriage return line feed (CRLF) with diff --git a/winlogbeat/sys/strings_test.go b/winlogbeat/sys/strings_test.go new file mode 100644 index 000000000000..48cae4831aa5 --- /dev/null +++ b/winlogbeat/sys/strings_test.go @@ -0,0 +1,77 @@ +package sys + +import ( + "bytes" + "encoding/binary" + "testing" + "unicode/utf16" + + "github.com/stretchr/testify/assert" +) + +func toUTF16Bytes(in string) []byte { + var u16 []uint16 = utf16.Encode([]rune(in)) + buf := &bytes.Buffer{} + binary.Write(buf, binary.LittleEndian, u16) + return buf.Bytes() +} + +func TestUTF16BytesToString(t *testing.T) { + input := "abc白鵬翔\u145A6" + utf16Bytes := toUTF16Bytes(input) + + output, _, err := UTF16BytesToString(utf16Bytes) + if err != nil { + t.Fatal(err) + } + assert.Equal(t, input, output) +} + +func TestUTF16BytesToStringOffset(t *testing.T) { + in := bytes.Join([][]byte{toUTF16Bytes("one"), toUTF16Bytes("two"), toUTF16Bytes("three")}, []byte{0, 0}) + + output, offset, err := UTF16BytesToString(in) + if err != nil { + t.Fatal(err) + } + assert.Equal(t, "one", output) + assert.Equal(t, 8, offset) + + in = in[offset:] + output, offset, err = UTF16BytesToString(in) + if err != nil { + t.Fatal(err) + } + 
assert.Equal(t, "two", output) + assert.Equal(t, 8, offset) + + in = in[offset:] + output, offset, err = UTF16BytesToString(in) + if err != nil { + t.Fatal(err) + } + assert.Equal(t, "three", output) + assert.Equal(t, -1, offset) +} + +func BenchmarkUTF16BytesToString(b *testing.B) { + utf16Bytes := toUTF16Bytes("A logon was attempted using explicit credentials.") + + b.Run("simple_string", func(b *testing.B) { + b.ResetTimer() + + for i := 0; i < b.N; i++ { + UTF16BytesToString(utf16Bytes) + } + }) + + // Buffer larger than the string. + b.Run("larger_buffer", func(b *testing.B) { + utf16Bytes = append(utf16Bytes, make([]byte, 2048)...) + b.ResetTimer() + + for i := 0; i < b.N; i++ { + UTF16BytesToString(utf16Bytes) + } + }) +}