Skip to content

Commit

Permalink
Reduce allocations in UTF16 conversion (elastic#3113)
Browse files Browse the repository at this point in the history
When decoding a UTF-16 string from a buffer that is larger than the string itself, more memory was allocated than required.

```
BenchmarkUTF16BytesToString/simple_string-4         	 2000000	       846 ns/op	     384 B/op	       3 allocs/op
BenchmarkUTF16BytesToString/larger_buffer-4         	 2000000	       874 ns/op	     384 B/op	       3 allocs/op
BenchmarkUTF16BytesToString_Original/simple_string-4         	 2000000	       840 ns/op	     384 B/op	       3 allocs/op
BenchmarkUTF16BytesToString_Original/larger_buffer-4         	 1000000	      3055 ns/op	    8720 B/op	       3 allocs/op
```

```
PS C:\Gopath\src\github.com\elastic\beats\winlogbeat> go test -v github.com/elastic/beats/winlogbeat/eventlog -run ^TestBenchmarkBatchReadSize$ -benchmem -benchtime 10s -benchtest
=== RUN   TestBenchmarkBatchReadSize
--- PASS: TestBenchmarkBatchReadSize (68.04s)
        bench_test.go:100: batch_size=10, total_events=20000, batch_time=5.682627ms, events_per_sec=1759.7494961397256, bytes_alloced_per_event=44 kB, total_allocs=4923840
        bench_test.go:100: batch_size=100, total_events=30000, batch_time=53.850879ms, events_per_sec=1856.9799018508127, bytes_alloced_per_event=44 kB, total_allocs=7354285
        bench_test.go:100: batch_size=500, total_events=25000, batch_time=271.118774ms, events_per_sec=1844.2101689350366, bytes_alloced_per_event=43 kB, total_allocs=6125665
        bench_test.go:100: batch_size=1000, total_events=30000, batch_time=558.03918ms, events_per_sec=1791.9888707455987, bytes_alloced_per_event=43 kB, total_allocs=7350324
PASS
ok      github.com/elastic/beats/winlogbeat/eventlog    68.095s

PS C:\Gopath\src\github.com\elastic\beats\winlogbeat> go test -v github.com/elastic/beats/winlogbeat/eventlog -run ^TestBenchmarkBatchReadSize$ -benchmem -benchtime 10s -benchtest
=== RUN   TestBenchmarkBatchReadSize
--- PASS: TestBenchmarkBatchReadSize (71.85s)
        bench_test.go:100: batch_size=10, total_events=30000, batch_time=5.713873ms, events_per_sec=1750.1264028794478, bytes_alloced_per_event=25 kB, total_allocs=7385820
        bench_test.go:100: batch_size=100, total_events=30000, batch_time=52.454484ms, events_per_sec=1906.4147118480853, bytes_alloced_per_event=24 kB, total_allocs=7354318
        bench_test.go:100: batch_size=500, total_events=25000, batch_time=260.56659ms, events_per_sec=1918.8952812407758, bytes_alloced_per_event=24 kB, total_allocs=6125688
        bench_test.go:100: batch_size=1000, total_events=30000, batch_time=530.468816ms, events_per_sec=1885.124949550286, bytes_alloced_per_event=24 kB, total_allocs=7350360
PASS
ok      github.com/elastic/beats/winlogbeat/eventlog    71.908s
```
  • Loading branch information
andrewkroh authored and ruflin committed Dec 5, 2016
1 parent 6ba7700 commit 833a806
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 10 deletions.
42 changes: 32 additions & 10 deletions winlogbeat/sys/strings.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,49 @@ import (
"unicode/utf16"
)

// UTF16BytesToString returns the Unicode code point sequence represented
// by the UTF-16 buffer b.
// UTF16BytesToString returns a string that is decoded from the UTF-16 bytes.
// The byte slice must be of even length otherwise an error will be returned.
// The integer returned is the offset to the start of the next string with
// buffer if it exists, otherwise -1 is returned.
func UTF16BytesToString(b []byte) (string, int, error) {
if len(b)%2 != 0 {
return "", 0, fmt.Errorf("Slice must have an even length (length=%d)",
len(b))
return "", 0, fmt.Errorf("Slice must have an even length (length=%d)", len(b))
}

offset := -1

// Find the null terminator if it exists and re-slice the b.
if nullIndex := indexNullTerminator(b); nullIndex > 0 {
if len(b) > nullIndex+2 {
offset = nullIndex + 2
}

b = b[:nullIndex]
}

offset := len(b)/2 + 2
s := make([]uint16, len(b)/2)
for i := range s {
s[i] = uint16(b[i*2]) + uint16(b[(i*2)+1])<<8
}

if s[i] == 0 {
s = s[0:i]
offset = i*2 + 2
break
return string(utf16.Decode(s)), offset, nil
}

// indexNullTerminator returns the byte index of the first null terminator
// within a buffer containing UTF-16 encoded data. A terminator is a pair of
// zero bytes that starts at an even index. If no terminator is found, -1 is
// returned.
//
// A trailing unpaired byte (odd len(b)) is ignored rather than read past.
func indexNullTerminator(b []byte) int {
	// Requiring i+1 < len(b) keeps the b[i+1] access in bounds for any
	// length, including empty, single-byte and odd-length buffers.
	for i := 0; i+1 < len(b); i += 2 {
		if b[i] == 0 && b[i+1] == 0 {
			return i
		}
	}

	return -1
}

// RemoveWindowsLineEndings replaces carriage return line feed (CRLF) with
Expand Down
77 changes: 77 additions & 0 deletions winlogbeat/sys/strings_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package sys

import (
"bytes"
"encoding/binary"
"testing"
"unicode/utf16"

"github.com/stretchr/testify/assert"
)

// toUTF16Bytes encodes in as UTF-16 and returns the code units serialized as
// little-endian bytes. No null terminator is appended.
func toUTF16Bytes(in string) []byte {
	var out bytes.Buffer
	var pair [2]byte
	for _, unit := range utf16.Encode([]rune(in)) {
		binary.LittleEndian.PutUint16(pair[:], unit)
		out.Write(pair[:])
	}
	return out.Bytes()
}

func TestUTF16BytesToString(t *testing.T) {
input := "abc白鵬翔\u145A6"
utf16Bytes := toUTF16Bytes(input)

output, _, err := UTF16BytesToString(utf16Bytes)
if err != nil {
t.Fatal(err)
}
assert.Equal(t, input, output)
}

// TestUTF16BytesToStringOffset walks a buffer holding three strings separated
// by null terminators and checks both the decoded text and the offset
// reported for each call.
func TestUTF16BytesToStringOffset(t *testing.T) {
	buf := bytes.Join(
		[][]byte{toUTF16Bytes("one"), toUTF16Bytes("two"), toUTF16Bytes("three")},
		[]byte{0, 0},
	)

	steps := []struct {
		text string
		next int
	}{
		{text: "one", next: 8},
		{text: "two", next: 8},
		{text: "three", next: -1},
	}

	for i, step := range steps {
		output, offset, err := UTF16BytesToString(buf)
		if err != nil {
			t.Fatal(err)
		}
		assert.Equal(t, step.text, output)
		assert.Equal(t, step.next, offset)

		// Advance to the next string after every step except the last.
		if i < len(steps)-1 {
			buf = buf[offset:]
		}
	}
}

// BenchmarkUTF16BytesToString measures UTF16BytesToString on a buffer that
// holds exactly one string ("simple_string") and on a buffer that has 2048
// zero bytes after the string ("larger_buffer").
func BenchmarkUTF16BytesToString(b *testing.B) {
	utf16Bytes := toUTF16Bytes("A logon was attempted using explicit credentials.")

	// Buffer larger than the string. It is built once, in its own slice,
	// because the testing package calls a sub-benchmark function several
	// times with different b.N; appending to utf16Bytes inside the closure
	// would add another 2048 bytes on every call and change the input
	// between runs.
	largerBuffer := make([]byte, len(utf16Bytes)+2048)
	copy(largerBuffer, utf16Bytes)

	b.Run("simple_string", func(b *testing.B) {
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			UTF16BytesToString(utf16Bytes)
		}
	})

	b.Run("larger_buffer", func(b *testing.B) {
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			UTF16BytesToString(largerBuffer)
		}
	})
}

0 comments on commit 833a806

Please sign in to comment.