-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
pkg/trace/agent: improve NormalizeTag #2951
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,6 @@ | ||
package agent | ||
|
||
import ( | ||
"bytes" | ||
"sort" | ||
"strings" | ||
"unicode" | ||
|
@@ -251,54 +250,91 @@ func FilterTags(tags, groups []string) []string { | |
// backend requirements | ||
// taken from dd-go.model.NormalizeTag | ||
func NormalizeTag(tag string) string { | ||
// unless you just throw out unicode, this is already as fast as it gets | ||
|
||
buf := bytes.NewBuffer(make([]byte, 0, 2*len(tag))) | ||
lastWasUnderscore := false | ||
|
||
for _, c := range tag { | ||
// fast path for len check | ||
if buf.Len() >= maxTagLength { | ||
var ( | ||
trim int // start character (if trimming) | ||
wiping bool // true when the previous character has been discarded | ||
wipe [][2]int // sections to discard: (start, end) pairs | ||
chars int // number of characters processed | ||
) | ||
var ( | ||
i int // current byte | ||
c rune // current rune | ||
) | ||
norm := []byte(tag) | ||
for i, c = range tag { | ||
if chars >= maxTagLength { | ||
// we've reached the maximum | ||
break | ||
} | ||
// fast path for ascii alphabetic chars | ||
// fast path; all letters are ok | ||
switch { | ||
case c >= 'a' && c <= 'z': | ||
buf.WriteRune(c) | ||
lastWasUnderscore = false | ||
chars++ | ||
wiping = false | ||
continue | ||
case c >= 'A' && c <= 'Z': | ||
c -= 'A' - 'a' | ||
buf.WriteRune(c) | ||
lastWasUnderscore = false | ||
// lower-case | ||
norm[i] += 'a' - 'A' | ||
chars++ | ||
wiping = false | ||
continue | ||
} | ||
|
||
c = unicode.ToLower(c) | ||
switch { | ||
// handle always valid cases | ||
case unicode.IsLetter(c) || c == ':': | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can merge the 1st and 3rd case together? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually there is a catch here: switch case statements are evaluated top-to-bottom and the idea here is that you can not go past the second case (length = 0) if the character isn't a letter or a colon. In other words only letters or colons can start the tag name. If we merge the two, it will alter this rule. |
||
buf.WriteRune(c) | ||
lastWasUnderscore = false | ||
// skip any characters that can't start the string | ||
case buf.Len() == 0: | ||
chars++ | ||
wiping = false | ||
case chars == 0: | ||
// this character can not start the string, trim | ||
trim = i + 1 | ||
continue | ||
// handle valid characters that can't start the string. | ||
case unicode.IsDigit(c) || c == '.' || c == '/' || c == '-': | ||
buf.WriteRune(c) | ||
lastWasUnderscore = false | ||
// convert anything else to underscores (including underscores), but only allow one in a row. | ||
case !lastWasUnderscore: | ||
buf.WriteRune('_') | ||
lastWasUnderscore = true | ||
chars++ | ||
wiping = false | ||
default: | ||
// illegal character | ||
if !wiping { | ||
// start a new cut | ||
wipe = append(wipe, [2]int{i, i + 1}) | ||
wiping = true | ||
} else { | ||
// lengthen current cut | ||
wipe[len(wipe)-1][1]++ | ||
} | ||
} | ||
} | ||
|
||
// strip trailing underscores | ||
if lastWasUnderscore { | ||
b := buf.Bytes() | ||
return string(b[:len(b)-1]) | ||
norm = norm[trim : i+1] // trim start and end | ||
if len(wipe) == 0 { | ||
// tag was ok, return it as it is | ||
return string(norm) | ||
} | ||
delta := trim // cut offsets delta | ||
for _, cut := range wipe { | ||
// start and end of cut, including delta from previous cuts: | ||
start, end := cut[0]-delta, cut[1]-delta | ||
|
||
if end >= len(norm) { | ||
// this cut includes the end of the string; discard it | ||
// completely and finish the loop. | ||
norm = norm[:start] | ||
break | ||
} | ||
// replace the beginning of the cut with '_' | ||
norm[start] = '_' | ||
if end-start == 1 { | ||
// nothing to discard | ||
continue | ||
} | ||
// discard remaining characters in the cut | ||
copy(norm[start+1:], norm[end:]) | ||
|
||
return buf.String() | ||
// shorten the slice | ||
norm = norm[:len(norm)-(end-start)+1] | ||
|
||
// count the new delta for future cuts | ||
delta += cut[1] - cut[0] - 1 | ||
} | ||
return string(norm) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
--- | ||
enhancements: | ||
- | | ||
APM: improve performance of NormalizeTag function. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: we can put +32 directly, saying that it moves a ascii upper-case to ascii lower-case equivalent.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The compiler should auto compute this expression so we can leave this which I think gives better readability
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I agree with @erichgess here. I liked this way and I even checked how the standard library does it, so I'd rather keep it if that's ok: https://github.com/golang/go/blob/05f8b44d5edc2960eff106e5e780cf83535d0533/src/unicode/letter.go#L267