Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pkg/trace/agent: improve NormalizeTag #2951

Merged
merged 2 commits into from
Jan 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 68 additions & 32 deletions pkg/trace/agent/tags.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package agent

import (
"bytes"
"sort"
"strings"
"unicode"
Expand Down Expand Up @@ -251,54 +250,91 @@ func FilterTags(tags, groups []string) []string {
// backend requirements
// taken from dd-go.model.NormalizeTag
func NormalizeTag(tag string) string {
// unless you just throw out unicode, this is already as fast as it gets

buf := bytes.NewBuffer(make([]byte, 0, 2*len(tag)))
lastWasUnderscore := false

for _, c := range tag {
// fast path for len check
if buf.Len() >= maxTagLength {
var (
trim int // start character (if trimming)
wiping bool // true when the previous character has been discarded
wipe [][2]int // sections to discard: (start, end) pairs
chars int // number of characters processed
)
var (
i int // current byte
c rune // current rune
)
norm := []byte(tag)
for i, c = range tag {
if chars >= maxTagLength {
// we've reached the maximum
break
}
// fast path for ascii alphabetic chars
// fast path; all letters are ok
switch {
case c >= 'a' && c <= 'z':
buf.WriteRune(c)
lastWasUnderscore = false
chars++
wiping = false
continue
case c >= 'A' && c <= 'Z':
c -= 'A' - 'a'
buf.WriteRune(c)
lastWasUnderscore = false
// lower-case
norm[i] += 'a' - 'A'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: we can put +32 directly, saying that it moves a ascii upper-case to ascii lower-case equivalent.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The compiler should auto compute this expression so we can leave this which I think gives better readability

Copy link
Contributor Author

@gbbr gbbr Jan 28, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @erichgess here. I liked this way and I even checked how the standard library does it, so I'd rather keep it if that's ok: https://github.com/golang/go/blob/05f8b44d5edc2960eff106e5e780cf83535d0533/src/unicode/letter.go#L267

chars++
wiping = false
continue
}

c = unicode.ToLower(c)
switch {
// handle always valid cases
case unicode.IsLetter(c) || c == ':':
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can merge the 1st and 3rd case together?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there is a catch here: switch case statements are evaluated top-to-bottom and the idea here is that you can not go past the second case (length = 0) if the character isn't a letter or a colon. In other words only letters or colons can start the tag name.

If we merge the two, it will alter this rule.

buf.WriteRune(c)
lastWasUnderscore = false
// skip any characters that can't start the string
case buf.Len() == 0:
chars++
wiping = false
case chars == 0:
// this character can not start the string, trim
trim = i + 1
continue
// handle valid characters that can't start the string.
case unicode.IsDigit(c) || c == '.' || c == '/' || c == '-':
buf.WriteRune(c)
lastWasUnderscore = false
// convert anything else to underscores (including underscores), but only allow one in a row.
case !lastWasUnderscore:
buf.WriteRune('_')
lastWasUnderscore = true
chars++
wiping = false
default:
// illegal character
if !wiping {
// start a new cut
wipe = append(wipe, [2]int{i, i + 1})
wiping = true
} else {
// lengthen current cut
wipe[len(wipe)-1][1]++
}
}
}

// strip trailing underscores
if lastWasUnderscore {
b := buf.Bytes()
return string(b[:len(b)-1])
norm = norm[trim : i+1] // trim start and end
if len(wipe) == 0 {
// tag was ok, return it as it is
return string(norm)
}
delta := trim // cut offsets delta
for _, cut := range wipe {
// start and end of cut, including delta from previous cuts:
start, end := cut[0]-delta, cut[1]-delta

if end >= len(norm) {
// this cut includes the end of the string; discard it
// completely and finish the loop.
norm = norm[:start]
break
}
// replace the beginning of the cut with '_'
norm[start] = '_'
if end-start == 1 {
// nothing to discard
continue
}
// discard remaining characters in the cut
copy(norm[start+1:], norm[end:])

return buf.String()
// shorten the slice
norm = norm[:len(norm)-(end-start)+1]

// count the new delta for future cuts
delta += cut[1] - cut[0] - 1
}
return string(norm)
}
38 changes: 38 additions & 0 deletions pkg/trace/agent/tags_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,41 @@ func TestTagSetKey(t *testing.T) {
ts := NewTagSetFromString("a:b,a:b:c,abc")
assert.Equal(t, ":abc,a:b,a:b:c", ts.Key())
}

func TestNormalizeTag(t *testing.T) {
for _, tt := range []struct{ in, out string }{
{in: "ok", out: "ok"},
{in: "AlsO:ök", out: "also:ök"},
{in: ":still_ok", out: ":still_ok"},
{in: "___trim", out: "trim"},
{in: "12.:trim@", out: ":trim"},
{in: "12.:trim@@", out: ":trim"},
{in: "fun:ky__tag/1", out: "fun:ky_tag/1"},
{in: "fun:ky@tag/2", out: "fun:ky_tag/2"},
{in: "fun:ky@@@tag/3", out: "fun:ky_tag/3"},
{in: "tag:1/2.3", out: "tag:1/2.3"},
{in: "---fun:k####y_ta@#g/1_@@#", out: "fun:k_y_ta_g/1"},
{in: "AlsO:œ#@ö))œk", out: "also:œ_ö_œk"},
} {
t.Run("", func(t *testing.T) {
assert.Equal(t, tt.out, NormalizeTag(tt.in), tt.in)
})
}
}

func benchNormalizeTag(tag string) func(b *testing.B) {
return func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
NormalizeTag(tag)
}
}
}

func BenchmarkNormalizeTag(b *testing.B) {
b.Run("ok", benchNormalizeTag("good_tag"))
b.Run("trim", benchNormalizeTag("___trim_left"))
b.Run("trim-both", benchNormalizeTag("___trim_right@@#!"))
b.Run("plenty", benchNormalizeTag("fun:ky_ta@#g/1"))
b.Run("more", benchNormalizeTag("fun:k####y_ta@#g/1_@@#"))
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
APM: improve performance of NormalizeTag function.